Sunlight Harvesting Energy Conversion¶

In [ ]:
import os
import math
import random
import numpy as np
import polars as pl
import seaborn as sns
import xgboost as xgb
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [20, 5]
plt.rcParams["figure.autolayout"] = True
pl.Config.set_tbl_rows(256)

import nmf.nmf as nmf
import knee.kneedle as kneedle

from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.mixture import GaussianMixture

from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Remove SciKit Learn warnings
import warnings
warnings.filterwarnings('ignore')
os.environ["PYTHONWARNINGS"] = "ignore" # Also affect subprocesses

Dataset visualization and analysis¶

In [ ]:
df = pl.read_excel('db.xlsx')
df = df.drop(['#'])
df
Out[ ]:
shape: (201, 11)
mat0mat1abs_peakabs_minabs_maxem_peakem_minem_maxQY (%)hopt (%)PCE (%)
strstri64i64i64i64i64i64f64f64f64
"dye""film"57842060061355075098.018.8null
"dye""bulk"540400600590540690nullnullnull
"dye""bulk"540400600590540690nullnullnull
"QD""bulk"7306009009108001000null2.5null
"QD""bulk"7306009009108001000null2.5null
"QD""fiber"7306009009108001000null4.0null
"QD""fiber"7306009009108001000null7.0null
"QD""solution"800400800900700100030.01.43.2
"QD""solution"60040060063060070050.00.51.2
"dye""solution"550450600580550650nullnull1.3
"QD""solution"37630065062360070050.00.3null
"dye""solution"41340060065760080067.03.4null
"dye""solution"55045060062955075095.02.6null
"dye""solution"46630075068570090011.00.6null
"dye""solution"5573007508027009009.00.5null
"polymer""solution"46035055059255080045.01.0null
"polymer""solution"46730055059355070048.00.9null
"dye""film"40330045047140060040.87.7null
"dye""bulk"374300400450400600100.0nullnull
"dye""bulk"37030045065045075067.05.5null
"NP""bulk"470400550597500725null4.093.55
"dye""bulk"620400600653600800nullnull0.55
"dye""bulk"34030040054040070014.00.25null
"dye""bulk"34030040048040060078.00.4null
"dye""bulk"580300600620600800100.0null0.0018
"dye""film"74540085080870090025.0null0.61
"dye""film"74540090080870090025.0null1.24
"dye""film"74540085080870090025.0null0.54
"dye""film"74540090080870090025.0null1.41
"dye""bulk"52540065061050080097.6null2.6
"NP""bulk"37530045075040090045.04.251.33
"Lndye""fiber"540350550630500700null0.29null
"dye""bulk"380300500509480700null10.42.2
"NP""bulk"550400750800600100080.06.8null
"QD""film"450370600630600700nullnullnull
"QD""film"45040060083065097540.0nullnull
"QD""waveguide"65050077071852085063.01.75null
"QD""solution"65050077071852085063.03.67null
"QD""film"48030050061957067036.22.952.25
"CD""bulk"350300400435380575null12.232.63
"CD""bulk"350280400450390600null4.522.49
"CD""film"35030042551040065045.012.0null
"CD""film"35822542544140060094.03.9null
"CD""film"325300400430380600null5.024.97
"CD""bulk"34030060054045070040.00.92null
"CDQD""tandem"45030055062045070045.01.4null
"CDdye""bulk"470350550560380750nullnull4.06
"CDQD""bulk"40030050050040055070.0null3.05
"NP""film"40030050060050070025.01.85null
"NP""film"50040060053248060092.0nullnull
"NP""bulk"32530052051847552558.02.41.8
"CD""film"380280700420400650null4.753.94
"CD""bulk"44030050054045070025.01.2null
"QD""bulk"500400600830600100050.0null2.85
"QD""bulk"50035075062755070035.91.45null
"QD""film"450350750862800110091.08.12.94
"QD""bulk"35030050055045077581.026.58.71
"QD""bulk"350300500588450750nullnull4.2
"QD""film"350400800740600850null6.973.18
"QD""bulk"500350650805600100078.06.43.1
"QD""bulk"6404001000960700120040.0null3.27
"QD""bulk"415310620918620124060.3null3.94
"QD""bulk"375300500638500825nullnullnull
"QD""bulk"36032550060047570053.0nullnull
"Ln""bulk"650400900114010001200nullnullnull
"dye""bulk"44625050055350080089.531.3null
"dye""bulk"47825060060155080061.122.0null
"dye""bulk"51330065064265080024.83.3null
"dye""bulk"44925055057150080080.027.8null
"dye""bulk"47325060060750080044.324.7null
"dye""film"76060080078770090024.0null0.44
"dye""film"76060080078770090024.0null0.28
"dye""film"70055080078470090030.0null0.62
"dye""film"70055080078470090030.0null0.36
"dye""film"73855085081970090023.0null0.41
"dye""film"73855085081970090023.0null0.28
"QD""bulk"45035010008256501000nullnull7.9
"QD""bulk"763350950856650105070.0null4.74
"dye""bulk"49130050058155065095.023.72.81
"dye""bulk"580350600620400800nullnull7.1
"dye""bulk"45030035048045060017.0null8.99
"Ln""bulk"347250400613570710null2.470.19
"Ln""bulk"59028060063055070065.0null11.3
"Ln""film"38025040061257071030.50.340.0019
"Ln""film"3602503805454507001.60.270.00078
"Ln""film"37029038061157071027.03.20.007
"Ln""film"29028040054638065040.08.8null
"Ln""film"35024042061257071034.04.3null
"Ln""film"3252404006125707108.01.2null
"Ln""film"32524040054445065012.01.7null
"Ln""film"290200380610570710nullnullnull
"Ln""film"37024040061257071063.09.0null
"Ln""film"37030038061557071061.01.20.2
"Ln""film"38024038061257071023.00.430.03
"Ln""film"36024038061257071030.00.010.0006
"Ln""fiber"36024042061557071085.02.30.00086
"dye""film"52030054059055070078.00.08null
"dye""fiber"52030054059055070078.01.60.0052
"dye""fiber"52030054059055070093.08.00.0024
"Ln""film"34025038061357071044.0null0.0441
"Ln""film"40525042061357071044.0null0.0499
"Ln""film"35025040061357071073.00.280.28
"Ln""film"32025035061357071086.0nullnull
"dye""bulk"57030060065035075080.014.52.16
"dye""film"77030080077570095016.01.5null
"dye""bulk"66525070067060075012.03.70.1
"Ln""fiber"37030045061557071089.00.7null
"dye""fiber"56030060058055070095.02.1null
"dye""fiber"78030075073065085021.00.5null
"Ln""fiber"37030045061557071089.0null0.08
"dye""fiber"56030060058055070095.0null0.21
"QD""bulk"58050062063059070010.0null2.1
"QD""bulk"58050062063059070060.0null2.1
"dye""bulk"57540061062057072095.0null3.3
"QD""bulk"58050062063059070010.0null2.1
"QD""bulk"45040065063458068044.0null2.8
"QD""bulk"45040060058055065086.048.0null
"dye""film"570300700600350700nullnullnull
"QD""film"39635045058250070053.0nullnull
"Ln""bulk"53030058063055075083.0null1.44
"QD""bulk"47335055064060070045.01.0null
"QD""bulk"35030045055045075056.0null8.71
"QD""bulk"47040060055045070020.02.01null
"dye""bulk"52145055053950070093.054.0null
"CD""film"45031051052550070011.07.586.0
"dye""film"54540070068055080050.012.5null
"dye""solution"49830070058052070030.06.880.27
"dye""solution"56935060059555075061.02.58null
"dye""solution"48830050051045060051.03.30.35
"CD""bulk"46035050051035050054.02.7null
"CD""bulk"3403005005204007006.05.84null
"CD""bulk"72025080049040060065.0null8.75
"CD""film"42030055051540070040.01.60.7
"CD""film"55730060061240070070.02.3null
"CD""film"42030055050044060067.02.21.13
"dye""bulk"57540061062057070096.037.7null
"CD""solution"49125052052050070082.05.430.18
"CD""film"51025052053540065078.00.0580.00083
"CD""film"51025052053540065078.01.70.014
"dye""bulk"57540061062057072095.019.02.9
"QD""bulk"48040066062255070015.73.20.62
"CD""film"40530050052040070070.03.21.9
"CD""film"40530050052040070065.02.91.7
"QDdye""bulk"50040070060045075032.71.0null
"CD""film"34728050054045070061.04.564.1
"Ln""bulk"40525050052045070081.03.41.37
"CD""film"49030052051045065080.5null2.06
"CD""film"49030052051045065080.54.84.36
"CD""film"38030045045040070011.541.36null
"QD""film"30030055051545055035.913.08null
"QD""film"32030055067060070032.972.55null
"CDQD""film"32030055051540070023.01.89null
"CDQD""film"32030055067040070022.02.54null
"CDQD""film"32030055050040070026.03.76null
"CD""film"35030065058045075035.0null1.9
"CD""film"38030050052040070035.0null1.7
"CD""film"37030065056040077535.0null2.3
"CD""film"40030050051745070033.04.50.117
"CD""bulk"40030050051745070041.05.890.16
"CD""bulk"40030050051745070041.03.130.061
"CD""film"40430055059450075086.42.62.3
"CD""film"55530065065057580017.63.02.7
"CD""film"45030066060050080060.04.33.8
"CD""solution"340300450520400700null1.230.43
"CD""film"340300450520400700null0.90.62
"CD""bulk"4703005505354506509.69.3null
"CD""film"38030050057045070041.523.512.39
"CD""film"35530050052045065015.012.761.94
"CD""film"4103006006085507507.62.771.96
"CD""film"40030060060045075022.04.032.92
"QD""bulk"60040065062555070030.02.70.38
"QD""bulk"35030050049542570091.0null4.29
"QD""bulk"35030050049542570091.0null0.55
"QD""bulk"35030050057845070011.0null0.77
"Ln""film"45038525061572057044.04.80.054
"Ln""film"40027525054370047037.07.70.058
"CD""film"42534025042560040011.013.10.053
"Ln""film"45038525061572057044.05.20.046
"Ln""film"40027525054370047037.07.70.047
"CD""film"42534025042560040011.012.80.041
"Ln""film"45039025061272057059.06.70.074
"Ln""film"40028025054370047054.08.50.065
"Ln""film"45039025061272057059.010.70.096
"Ln""film"40028025054370047054.08.70.053
"Ln""film"45039025061272057059.011.70.142
"Ln""film"40028025054370047054.016.50.136
"dye""film"6504152506909006004.02.60.044
"polymer""film"50039225049165040047.05.712.29
"polymer""film"50039225047365045057.09.1122.32
"polymer""film"50039225045465040068.012.082.47
"polymer""film"50039225049165040047.05.714.38
"polymer""film"50039225047365045057.09.1124.62
"polymer""film"50039225045465040068.012.084.92
"NP""bulk"60051145052758049065.00.150.049413
"dye""bulk"60055445061270057570.00.160.050786
"NPdye""bulk"60053045055070050068.00.220.07531
"QD""solution"51045030051755047589.02.320.020944
"dye""solution"66048068066560080031.02.650.21
"Ln""film"37025040061257072060.00.020.000198
"CD""film"51025052053540065078.00.0350.000182
"Ln""film"37025040061257072060.00.0480.000471
In [ ]:
sns.pairplot(df.to_pandas())
fig = plt.gcf()
fig.savefig(f'figures/pairplot.pdf', bbox_inches='tight')
plt.show()
In [ ]:
sns.pairplot(df.to_pandas(), hue='PCE (%)')
plt.show()
In [ ]:
sns.pairplot(df.to_pandas(), hue='hopt (%)')
plt.show()
In [ ]:
def idx_to_rowcol(idx, width):
    r = int(idx / width)
    c = int(idx % width)
    return (r,c)


def rowcol_to_idx(r,c, width):
    idx = int((r * width) + c)
    return idx


def describe_variables(df, categorical=['mat0', 'mat1'], cols=4, figsize=(20,15), filename=None):
    # get the pandas dataframe (since polars does not allow plots)
    df_pandas = df.to_pandas()
    for col in categorical:
        df_pandas[col] = df_pandas[col].astype('category')
    
    # compute the number of rows
    total = (2 * len(df.get_columns())) - len(categorical)
    rows = math.ceil(total/cols)
    fig, axes = plt.subplots(rows, cols, constrained_layout = True, figsize=figsize)
    #print(f'Total {total} -> ({rows} {cols})')
    
    idx = 0
    for col in df.get_columns():
        r, c = idx_to_rowcol(idx, cols)
        #print(f'({col.name}: {idx} -> {r}, {c})')
        sns.histplot(ax=axes[r, c], data=df_pandas, x=col.name)
        idx += 1
        if col.name not in categorical:
            r, c = idx_to_rowcol(idx, cols)
            #print(f'({col.name}: {idx} -> {r}, {c})')
            sns.boxplot(ax=axes[r, c], data=df_pandas, x=col.name)
            idx += 1
    if isinstance(filename, str):
        fig = plt.gcf()
        fig.savefig(f'figures/{filename}.pdf', bbox_inches='tight')
    plt.show()
In [ ]:
describe_variables(df, filename='feaures_description')
In [ ]:
def print_missing_samples(df):
    print(f'Shape: {df.shape}')
    total, _ = df.shape
    for col in df.get_columns():
        print(f'{col.name:<6}: {col.is_null().sum()/total:.0%}')
In [ ]:
print_missing_samples(df)
Shape: (201, 11)
mat0  : 0%
mat1  : 0%
abs_peak: 0%
abs_min: 0%
abs_max: 0%
em_peak: 0%
em_min: 0%
em_max: 0%
QY (%): 14%
hopt (%): 28%
PCE (%): 36%

PCE and hopt prediction¶

Baseline (using only abs and em)¶

In [ ]:
df_baseline_PCE = df.drop(['mat0', 'mat1', 'QY (%)', 'hopt (%)']).drop_nulls()
print_missing_samples(df_baseline_PCE)
Shape: (128, 7)
abs_peak: 0%
abs_min: 0%
abs_max: 0%
em_peak: 0%
em_min: 0%
em_max: 0%
PCE (%): 0%
In [ ]:
df_baseline_hopt = df.drop(['mat0', 'mat1', 'QY (%)', 'PCE (%)']).drop_nulls()
print_missing_samples(df_baseline_hopt)
Shape: (144, 7)
abs_peak: 0%
abs_min: 0%
abs_max: 0%
em_peak: 0%
em_min: 0%
em_max: 0%
hopt (%): 0%
In [ ]:
max_depth = [int(x) for x in np.linspace(10, 100, num = 5)]
max_depth.append(None)

# TODO: fix this
# Print Models parameters
models = [('Linear Regression', LinearRegression(), {}),
          ('K Neighbors', KNeighborsRegressor(),
           {'reg__n_neighbors':[1,3,5,7,9], 'reg__weights':['uniform', 'distance'], 'reg__p':[1,2]}),
          ('Random Forest', RandomForestRegressor(random_state=42),
           {'reg__n_estimators':[50, 100, 150, 200], 'reg__min_samples_split':[2, 5, 10],
            'reg__min_samples_leaf':[1, 2, 4], 'reg__bootstrap':[True, False], 'reg__max_depth':max_depth}),
          ('Gradient Boosting', GradientBoostingRegressor(random_state=42),
           {'reg__n_estimators':[50, 100, 150, 200],'reg__min_samples_split':[2, 5, 10],
            'reg__min_samples_leaf':[1, 2, 4], 'reg__max_depth':max_depth}),
         ('XGBoost', xgb.XGBRegressor(objective="reg:squarederror", random_state=42),
          {'reg__max_depth': range (2, 10, 1), 'reg__n_estimators': range(60, 220, 40), 'reg__learning_rate': [0.1, 0.01, 0.05]})]

# remove heavy models that do not perform that well
#('Lasso Regression', Lasso(max_iter=5000, tol=1E-2, random_state=42), {'alpha': (np.logspace(-8, 8, 20))}),
#('Support Vector Regressor', SVR(),{'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01],'kernel': ['linear','rbf', 'sigmoid']}),
#('ANN', MLPRegressor(max_iter=50000,  tol=1E-3, random_state=42),{'hidden_layer_sizes':[(8,), (16,), (32,)],'activation':['relu','logistic'], 'learning_rate':['constant', 'invscaling', 'adaptive']}),
In [ ]:
def create_folds(X, y, b='auto', k=5):
    if isinstance(b, str):
        bins = np.histogram_bin_edges(y, bins=b)
        # remove the last index (end point)
        bins = bins[:-1]
    elif isinstance(b, int):
        bins = np.linspace(min(y), max(y), num=b, endpoint=False)
    else:
        raise Exception(f'Undefined bins {b}')
        
    #print(f'Bins: {bins}')
    groups = np.digitize(y, bins)
    #print(f'Group: {groups}')
    skf = StratifiedKFold(n_splits=k)
    return skf.split(X, groups)
In [ ]:
from collections.abc import Iterable

def compute_performance(models, X, y, b=None, k=[5, 3], filename=None):
    if isinstance(k, Iterable):
        outer_k = k[0]
        inner_k = k[1]
    elif isinstance(k, int):
        outer_k = k
        inner_k = k
    else:
        raise Exception(f'Undefined k {k}')

    #kf = KFold(n_splits=cv)
    folds = create_folds(X, y, b=b, k=outer_k)
    
    perf_per_model = {}
    y_true = []
    y_true_train = []
    y_preds = {}
    y_preds_train = {}
    #for train_index, test_index in kf.split(X):
    for train_index, test_index in folds:
        X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]
    
        y_true.extend(y_test)
        y_true_train.extend(y_train)
    
        # optimize each model and store the best result
        best_models = []
        for _, model, params in models:
            pipeline = Pipeline([('sca', StandardScaler()),('reg', model)])
            #clf = GridSearchCV(pipeline, params, cv=cv, scoring='neg_mean_absolute_error', n_jobs=-1)
            grid_folds = create_folds(X_train, y_train, b=b, k=inner_k)
            #clf = GridSearchCV(pipeline, params, cv=cv, scoring='r2', n_jobs=-1)
            clf = GridSearchCV(pipeline, params, cv=grid_folds, scoring='r2', n_jobs=-1)
            clf.fit(X_train, y_train)
            best_models.append(clf.best_estimator_)

        # compute the predictions and store the results
        for i in range(len(models)):
            model = best_models[i]
            y_pred = model.predict(X_test)
            y_pred_train = model.predict(X_train)
            
            if models[i][0] not in y_preds:
                y_preds[models[i][0]] = []
            
            if models[i][0] not in y_preds_train:
                y_preds_train[models[i][0]] = []
            
            y_preds[models[i][0]].extend(y_pred)
            y_preds_train[models[i][0]].extend(y_pred_train)
            
            score_mae_test = mean_absolute_error(y_test, y_pred)
            score_mse_test = mean_squared_error(y_test, y_pred)
            score_r2_test = r2_score(y_test, y_pred)
            
            score_mae_train = mean_absolute_error(y_train, y_pred_train)
            score_mse_train = mean_squared_error(y_train, y_pred_train)
            score_r2_train = r2_score(y_train, y_pred_train)
            
            if models[i][0] not in perf_per_model:
                perf_per_model[models[i][0]] = {'train':{'mae':[], 'mse': [], 'r2':[]},
                                                'test':{'mae':[], 'mse': [], 'r2':[]}}
            
            perf_per_model[models[i][0]]['test']['mae'].append(score_mae_test)
            perf_per_model[models[i][0]]['test']['mse'].append(score_mse_test)
            perf_per_model[models[i][0]]['test']['r2'].append(score_r2_test)
            
            perf_per_model[models[i][0]]['train']['mae'].append(score_mae_train)
            perf_per_model[models[i][0]]['train']['mse'].append(score_mse_train)
            perf_per_model[models[i][0]]['train']['r2'].append(score_r2_train)
            
    # print the results
    print(f'Train')
    print(f'| {"Model":^17} | {"MAE":^6} | {"MSE":^6} | {"R2":^6} |')
    print(f'| ----------------- | ------ | ------ | ------ |')
    for i in range(len(models)):
        mae = perf_per_model[models[i][0]]['train']['mae']
        mse = perf_per_model[models[i][0]]['train']['mse']
        r2  = perf_per_model[models[i][0]]['train']['r2']
        #print(f'{mae} | {mse} | {r2}')
        mae = np.mean(mae)
        mse = np.mean(mse)
        r2  = np.mean(r2)
        print(f'| {models[i][0]:<17} | {round(mae, 2):>6} | {round(mse, 2):>6} | {round(r2, 2):>6} |')
    print()
    
    print(f'Test')
    print(f'| {"Model":^17} | {"MAE":^6} | {"MSE":^6} | {"R2":^6} |')
    print(f'| ----------------- | ------ | ------ | ------ |')
    for i in range(len(models)):
        mae = perf_per_model[models[i][0]]['test']['mae']
        mse = perf_per_model[models[i][0]]['test']['mse']
        r2  = perf_per_model[models[i][0]]['test']['r2']
        #print(f'{mae} | {mse} | {r2}')
        mae = np.mean(mae)
        mse = np.mean(mse)
        r2  = np.mean(r2)
        print(f'| {models[i][0]:<17} | {round(mae, 2):>6} | {round(mse, 2):>6} | {round(r2, 2):>6} |')
    print()

    # plot the graphs
    print(f'Train')
    data_train = []
    labels_train = []
    for name, _, _ in models:
        labels_train.append(name)

        # DEBUG STUFF:
        #score_mae = mean_absolute_error(y_true_train, y_preds_train[name])
        #score_mse = mean_squared_error(y_true_train, y_preds_train[name])
        #score_r2 = r2_score(y_true_train, y_preds_train[name])
        #print(f'Model {name} {score_mae} {score_mse} {score_r2}')

        data_train.append(np.abs(np.subtract(y_true_train, y_preds_train[name])))
    ax = sns.violinplot(data=data_train)
    ax.set_xticklabels(labels_train)
    if isinstance(filename, str):
        fig = plt.gcf()
        fig.savefig(f'figures/{filename}_train.pdf', bbox_inches='tight')
    plt.show()

    print(f'Test')
    data_preds = []
    labels_preds = []
    for name, _, _ in models:
        labels_preds.append(name)

        # DEBUG STUFF:
        #score_mae = mean_absolute_error(y_true, y_preds[name])
        #score_mse = mean_squared_error(y_true, y_preds[name])
        #score_r2 = r2_score(y_true, y_preds[name])
        #print(f'Model {name} {score_mae} {score_mse} {score_r2}')

        data_preds.append(np.abs(np.subtract(y_true, y_preds[name])))
    ax = sns.violinplot(data=data_preds)
    ax.set_xticklabels(labels_preds)
    if isinstance(filename, str):
        fig = plt.gcf()
        fig.savefig(f'figures/{filename}_test.pdf', bbox_inches='tight')
    plt.show()
    
    # convert best_models to a dict and return it
    bm = {}
    for name, model, params in models:
        # train on whole dataset
        pipeline = Pipeline([('sca', StandardScaler()),('reg', model)])
        #clf = GridSearchCV(pipeline, params, cv=cv, scoring='neg_mean_absolute_error', n_jobs=-1)
        grid_folds = create_folds(X, y, b=b, k=inner_k)
        clf = GridSearchCV(pipeline, params, cv=grid_folds, scoring='r2', n_jobs=-1)
        clf.fit(X, y)
        bm[name] = clf.best_estimator_
    
    return bm
In [ ]:
# Get Output variables
PCE = df_baseline_PCE[['PCE (%)']]
y_PCE = PCE.to_numpy()[:,0]
#print(f'{y_PCE} {y_PCE.shape}')

# Get the Input variables
df_input = df_baseline_PCE.drop(['PCE (%)'])
X = df_input.to_numpy()
#print(f'{X} {X.shape}')

## PCE
print(f'PCE')
models_pce_baseline = compute_performance(models, X, y_PCE, b='auto', k=[9, 3], filename='pce_baseline')
PCE
Train
|       Model       |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Linear Regression |   1.53 |   4.39 |   0.13 |
| K Neighbors       |   0.22 |   0.46 |   0.91 |
| Random Forest     |   0.85 |   1.95 |   0.61 |
| Gradient Boosting |   0.25 |   0.26 |   0.95 |
| XGBoost           |   0.76 |   1.56 |   0.69 |

Test
|       Model       |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Linear Regression |   1.66 |   5.15 |  -0.01 |
| K Neighbors       |   1.31 |    4.1 |    0.2 |
| Random Forest     |   1.28 |   4.04 |   0.25 |
| Gradient Boosting |    1.3 |   4.42 |   0.16 |
| XGBoost           |   1.32 |   4.26 |    0.2 |

Train
Test
In [ ]:
# Get Output variables
hopt = df_baseline_hopt[['hopt (%)']]
y_hopt = hopt.to_numpy()[:,0]

# Get the Input variables
df_input = df_baseline_hopt.drop(['hopt (%)'])
X = df_input.to_numpy()

## hopt
print(f'hopt')
models_hopt_baseline = compute_performance(models, X, y_hopt, b='auto', k=[9, 3], filename='hopt_baseline')
hopt
Train
|       Model       |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Linear Regression |   5.14 |  65.18 |   0.07 |
| K Neighbors       |   0.98 |   8.88 |   0.87 |
| Random Forest     |   3.09 |  28.37 |    0.6 |
| Gradient Boosting |   1.13 |   4.37 |   0.94 |
| XGBoost           |   3.82 |  49.78 |    0.3 |

Test
|       Model       |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Linear Regression |   5.71 |  75.71 |  -0.15 |
| K Neighbors       |   4.69 |  62.34 |  -0.05 |
| Random Forest     |   5.19 |  68.62 |  -0.12 |
| Gradient Boosting |   4.76 |  67.24 |  -0.28 |
| XGBoost           |   4.54 |  66.16 |   0.02 |

Train
Test

Baseline + QY¶

In [ ]:
df_02_PCE = df.drop(['mat0', 'mat1', 'hopt (%)']).drop_nulls()
print_missing_samples(df_02_PCE)
Shape: (112, 8)
abs_peak: 0%
abs_min: 0%
abs_max: 0%
em_peak: 0%
em_min: 0%
em_max: 0%
QY (%): 0%
PCE (%): 0%
In [ ]:
df_02_hopt = df.drop(['mat0', 'mat1', 'PCE (%)']).drop_nulls()
print_missing_samples(df_02_hopt)
Shape: (129, 8)
abs_peak: 0%
abs_min: 0%
abs_max: 0%
em_peak: 0%
em_min: 0%
em_max: 0%
QY (%): 0%
hopt (%): 0%
In [ ]:
# Get Output variables
PCE = df_02_PCE[['PCE (%)']]
y_PCE = PCE.to_numpy()[:,0]

# Get the Input variables
df_input = df_02_PCE.drop(['PCE (%)'])
X = df_input.to_numpy()

## PCE
print(f'PCE')
models_02_pce = compute_performance(models, X, y_PCE, b='auto', k=[9, 3], filename='pce_qy')
PCE
Train
|       Model       |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Linear Regression |   1.48 |   4.21 |   0.13 |
| K Neighbors       |   0.12 |   0.15 |   0.97 |
| Random Forest     |   0.78 |   1.78 |   0.63 |
| Gradient Boosting |   0.26 |   0.28 |   0.94 |
| XGBoost           |   1.01 |   2.76 |   0.42 |

Test
|       Model       |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Linear Regression |   1.64 |   4.99 |  -0.17 |
| K Neighbors       |   1.11 |    3.8 |   0.17 |
| Random Forest     |   1.32 |   4.37 |  -0.23 |
| Gradient Boosting |   1.44 |   5.48 |  -0.64 |
| XGBoost           |   1.43 |   5.41 |  -0.32 |

Train
Test
In [ ]:
# Get Output variables
hopt = df_02_hopt[['hopt (%)']]
y_hopt = hopt.to_numpy()[:,0]

# Get the Input variables
df_input = df_02_hopt.drop(['hopt (%)'])
X = df_input.to_numpy()

## hopt
print(f'hopt')
models_02_hopt = compute_performance(models, X, y_hopt, b='auto', k=[9, 3], filename='hopt_qy')
hopt
Train
|       Model       |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Linear Regression |   5.25 |  62.42 |   0.18 |
| K Neighbors       |   2.38 |  23.96 |   0.68 |
| Random Forest     |   3.43 |  36.95 |   0.52 |
| Gradient Boosting |    0.8 |   3.32 |   0.95 |
| XGBoost           |   3.37 |  37.93 |   0.51 |

Test
|       Model       |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Linear Regression |   5.86 |  73.91 |  -0.39 |
| K Neighbors       |   4.56 |  50.99 |  -0.01 |
| Random Forest     |   4.97 |  64.28 |  -0.26 |
| Gradient Boosting |   4.95 |  70.14 |  -0.62 |
| XGBoost           |   4.21 |   63.4 |  -0.04 |

Train
Test

Baseline + QY + Categorical (Mat0 and Mat1)¶

In [ ]:
df_03_PCE = df.drop(['hopt (%)']).drop_nulls()
print_missing_samples(df_03_PCE)
Shape: (112, 10)
mat0  : 0%
mat1  : 0%
abs_peak: 0%
abs_min: 0%
abs_max: 0%
em_peak: 0%
em_min: 0%
em_max: 0%
QY (%): 0%
PCE (%): 0%
In [ ]:
df_03_hopt = df.drop(['PCE (%)']).drop_nulls()
print_missing_samples(df_03_hopt)
Shape: (129, 10)
mat0  : 0%
mat1  : 0%
abs_peak: 0%
abs_min: 0%
abs_max: 0%
em_peak: 0%
em_min: 0%
em_max: 0%
QY (%): 0%
hopt (%): 0%
In [ ]:
# Get Output variables
PCE = df_03_PCE[['PCE (%)']]
y_PCE = PCE.to_numpy()[:,0]

# Get the Input variables
df_input = df_03_PCE.drop(['mat0', 'mat1', 'PCE (%)'])
X = df_input.to_numpy()

temp_mat0 = df_03_PCE[['mat0']].to_numpy()
enc = OneHotEncoder(min_frequency=10, sparse_output=False)
encoded_mat0 = enc.fit_transform(temp_mat0)

temp_mat1 = df_03_PCE[['mat1']].to_numpy()
enc = OneHotEncoder(min_frequency=20, sparse_output=False)
encoded_mat1 = enc.fit_transform(temp_mat1)

X = np.concatenate((encoded_mat0, encoded_mat1, X), axis=1)

## PCE
print(f'PCE')
models_03_pce = compute_performance(models, X, y_PCE, b='auto', k=[9, 3], filename='pce_qy_mat')
PCE
Train
|       Model       |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Linear Regression |   1.17 |   3.16 |   0.34 |
| K Neighbors       |   0.12 |   0.15 |   0.97 |
| Random Forest     |   0.79 |   1.82 |   0.62 |
| Gradient Boosting |   0.27 |   0.28 |   0.94 |
| XGBoost           |   1.01 |   2.77 |   0.42 |

Test
|       Model       |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Linear Regression |   1.44 |   4.72 |  -0.34 |
| K Neighbors       |    1.1 |   4.02 |   0.03 |
| Random Forest     |    1.3 |   4.35 |   -0.2 |
| Gradient Boosting |   1.29 |   5.35 |  -0.53 |
| XGBoost           |   1.38 |   4.91 |  -0.11 |

Train
Test
In [ ]:
# Get Output variables
hopt = df_03_hopt[['hopt (%)']]
y_hopt = hopt.to_numpy()[:,0]

# Get the Input variables
df_input = df_03_hopt.drop(['mat0', 'mat1', 'hopt (%)'])
X = df_input.to_numpy()

temp_mat0 = df_03_hopt[['mat0']].to_numpy()
enc = OneHotEncoder(min_frequency=10, sparse_output=False)
encoded_mat0 = enc.fit_transform(temp_mat0)

temp_mat1 = df_03_hopt[['mat1']].to_numpy()
enc = OneHotEncoder(min_frequency=20, sparse_output=False)
encoded_mat1 = enc.fit_transform(temp_mat1)

X = np.concatenate((encoded_mat0, encoded_mat1, X), axis=1)

## hopt
print(f'hopt')
models_03_hopt = compute_performance(models, X, y_hopt, b='auto', k=[9, 3], filename='hopt_qy_mat')
hopt
Train
|       Model       |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Linear Regression |   4.71 |  47.33 |   0.38 |
| K Neighbors       |   0.99 |   9.13 |   0.89 |
| Random Forest     |   2.27 |  17.14 |   0.78 |
| Gradient Boosting |   0.49 |   1.48 |   0.98 |
| XGBoost           |   2.04 |  11.67 |   0.85 |

Test
|       Model       |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Linear Regression |   5.62 |  64.31 |  -0.26 |
| K Neighbors       |   4.12 |  47.73 |   0.15 |
| Random Forest     |   4.06 |  43.72 |   0.19 |
| Gradient Boosting |   4.52 |  60.79 |  -0.07 |
| XGBoost           |   3.67 |  40.66 |    0.3 |

Train
Test

Outlier removal¶

In [ ]:
def iqr_range(array):
    Q1 = np.percentile(array, 25)
    Q3 = np.percentile(array, 75)
    IQR = Q3 - Q1
    return ((Q1 - 1.5 * IQR),(Q3 + 1.5 * IQR))
In [ ]:
PCE_array = df[['PCE (%)']].drop_nulls().to_numpy()[:,0]
#PCE_array
pce_low, pce_high = iqr_range(PCE_array)
print(f'PCE [{pce_low}, {pce_high}]')
PCE [-4.000000000000001, 6.912000000000001]
In [ ]:
df_filtered_PCE = df.filter((pl.col('PCE (%)') > pce_low) & (pl.col('PCE (%)') < pce_high))
sns.boxplot(data=df_filtered_PCE.to_pandas(), x='PCE (%)')
plt.show()
print(f'PCE: {df_filtered_PCE.shape}')
PCE: (121, 11)
In [ ]:
hopt_array = df[['hopt (%)']].drop_nulls().to_numpy()[:,0]
#hopt_array
hopt_low, hopt_high = iqr_range(hopt_array)
print(f'hopt [{hopt_low}, {hopt_high}]')
hopt [-6.99875, 15.63125]
In [ ]:
df_filtered_hopt = df.filter((pl.col('hopt (%)') > pce_low) & (pl.col('hopt (%)') < pce_high))
sns.boxplot(data=df_filtered_hopt.to_pandas(), x='hopt (%)')
plt.show()
print(f'nhop: {df_filtered_hopt.shape}')
nhop: (106, 11)

PCE and hopt baselines¶

In [ ]:
#models_gbr = [('Gradient Boosting Regressor', GradientBoostingRegressor(random_state=42),
#{'n_estimators':[50, 100, 150],'min_samples_split':[2, 5, 10], 'min_samples_leaf':[1, 2, 4],'max_depth':max_depth})]
In [ ]:
df_out_baseline_PCE = df_filtered_PCE.drop(['mat0', 'mat1', 'QY (%)', 'hopt (%)']).drop_nulls()
print_missing_samples(df_out_baseline_PCE)
Shape: (121, 7)
abs_peak: 0%
abs_min: 0%
abs_max: 0%
em_peak: 0%
em_min: 0%
em_max: 0%
PCE (%): 0%
In [ ]:
# Get Output variables
PCE = df_out_baseline_PCE[['PCE (%)']]
y_PCE = PCE.to_numpy()[:,0]

# Get the Input variables
df_input = df_out_baseline_PCE.drop(['PCE (%)'])
X = df_input.to_numpy()

## PCE
print(f'PCE')
models_out_pce_baseline = compute_performance(models, X, y_PCE, b='auto', k=[9, 3], filename='pce_nout_baseline')
PCE
Train
|       Model       |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Linear Regression |   1.15 |    2.0 |   0.14 |
| K Neighbors       |   0.26 |   0.39 |   0.83 |
| Random Forest     |   0.45 |   0.46 |    0.8 |
| Gradient Boosting |   0.17 |   0.15 |   0.93 |
| XGBoost           |   0.54 |   0.69 |    0.7 |

Test
|       Model       |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Linear Regression |   1.25 |   2.35 |  -0.01 |
| K Neighbors       |   0.99 |   1.94 |    0.1 |
| Random Forest     |   0.91 |   1.62 |   0.25 |
| Gradient Boosting |   0.79 |   1.53 |   0.28 |
| XGBoost           |   1.05 |   2.08 |   0.03 |

Train
Test
In [ ]:
df_out_baseline_hopt = df_filtered_hopt.drop(['mat0', 'mat1', 'QY (%)', 'PCE (%)']).drop_nulls()
print_missing_samples(df_out_baseline_hopt)
Shape: (106, 7)
abs_peak: 0%
abs_min: 0%
abs_max: 0%
em_peak: 0%
em_min: 0%
em_max: 0%
hopt (%): 0%
In [ ]:
# Get Output variables
hopt = df_out_baseline_hopt[['hopt (%)']]
y_hopt = hopt.to_numpy()[:,0]

# Get the Input variables
df_input = df_out_baseline_hopt.drop(['hopt (%)'])
X = df_input.to_numpy()

## hopt
print(f'hopt')
models_out_hopt_baseline = compute_performance(models, X, y_hopt, b='auto', k=[9, 3], filename='hopt_nout_baseline')
hopt
Train
|       Model       |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Linear Regression |   1.43 |   3.11 |   0.04 |
| K Neighbors       |    0.1 |   0.09 |   0.97 |
| Random Forest     |   0.85 |   1.11 |   0.66 |
| Gradient Boosting |   0.32 |   0.22 |   0.93 |
| XGBoost           |   1.07 |   1.76 |   0.46 |

Test
|       Model       |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Linear Regression |   1.59 |   3.86 |  -0.18 |
| K Neighbors       |   1.36 |   2.88 |   0.09 |
| Random Forest     |   1.44 |   3.04 |   0.04 |
| Gradient Boosting |    1.5 |   3.39 |  -0.06 |
| XGBoost           |   1.45 |   3.18 |   0.01 |

Train
Test

Baseline + QY¶

In [ ]:
df_out_02_PCE = df_filtered_PCE.drop(['mat0', 'mat1', 'hopt (%)']).drop_nulls()
print_missing_samples(df_out_02_PCE)
Shape: (107, 8)
abs_peak: 0%
abs_min: 0%
abs_max: 0%
em_peak: 0%
em_min: 0%
em_max: 0%
QY (%): 0%
PCE (%): 0%
In [ ]:
df_out_02_hopt = df_filtered_hopt.drop(['mat0', 'mat1', 'PCE (%)']).drop_nulls()
print_missing_samples(df_out_02_hopt)
Shape: (95, 8)
abs_peak: 0%
abs_min: 0%
abs_max: 0%
em_peak: 0%
em_min: 0%
em_max: 0%
QY (%): 0%
hopt (%): 0%
In [ ]:
# Get Output variables
PCE = df_out_02_PCE[['PCE (%)']]
y_PCE = PCE.to_numpy()[:,0]

# Get the Input variables
df_input = df_out_02_PCE.drop(['PCE (%)'])
X = df_input.to_numpy()

## PCE
print(f'PCE')
models_out_02_pce = compute_performance(models, X, y_PCE, b='auto', k=[9, 3], filename='pce_nout_qy')
PCE
Train
|       Model       |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Linear Regression |   1.11 |   1.91 |   0.12 |
| K Neighbors       |   0.12 |   0.14 |   0.93 |
| Random Forest     |   0.39 |   0.37 |   0.83 |
| Gradient Boosting |   0.16 |   0.16 |   0.93 |
| XGBoost           |   0.56 |   0.71 |   0.67 |

Test
|       Model       |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Linear Regression |   1.27 |   2.42 |  -0.15 |
| K Neighbors       |   0.99 |   1.97 |   0.11 |
| Random Forest     |   0.91 |   1.65 |   0.24 |
| Gradient Boosting |   0.96 |   2.03 |   0.05 |
| XGBoost           |   0.97 |   1.85 |   0.15 |

Train
Test
In [ ]:
# Get Output variables
hopt = df_out_02_hopt[['hopt (%)']]
y_hopt = hopt.to_numpy()[:,0]

# Get the Input variables
df_input = df_out_02_hopt.drop(['hopt (%)'])
X = df_input.to_numpy()

## hopt
print(f'hopt')
models_out_02_hopt = compute_performance(models, X, y_hopt, b='auto', k=[9, 3], filename='hopt_nout_qy')
hopt
Train
|       Model       |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Linear Regression |   1.45 |   3.18 |   0.04 |
| K Neighbors       |   0.48 |   0.87 |   0.74 |
| Random Forest     |   0.66 |   0.74 |   0.78 |
| Gradient Boosting |   0.21 |   0.13 |   0.96 |
| XGBoost           |   0.97 |   1.52 |   0.54 |

Test
|       Model       |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Linear Regression |   1.63 |   4.12 |   -0.3 |
| K Neighbors       |   1.36 |   2.91 |   0.07 |
| Random Forest     |    1.4 |   2.92 |   0.07 |
| Gradient Boosting |   1.47 |   3.55 |  -0.14 |
| XGBoost           |   1.42 |   3.15 |   0.01 |

Train
Test

Baseline + QY + Categorical (Mat0 and Mat1)¶

In [ ]:
df_out_03_PCE = df_filtered_PCE.drop(['hopt (%)']).drop_nulls()
print_missing_samples(df_out_03_PCE)
Shape: (107, 10)
mat0  : 0%
mat1  : 0%
abs_peak: 0%
abs_min: 0%
abs_max: 0%
em_peak: 0%
em_min: 0%
em_max: 0%
QY (%): 0%
PCE (%): 0%
In [ ]:
df_out_03_hopt = df_filtered_hopt.drop(['PCE (%)']).drop_nulls()
print_missing_samples(df_out_03_hopt)
Shape: (95, 10)
mat0  : 0%
mat1  : 0%
abs_peak: 0%
abs_min: 0%
abs_max: 0%
em_peak: 0%
em_min: 0%
em_max: 0%
QY (%): 0%
hopt (%): 0%
In [ ]:
# Get Output variables
PCE = df_out_03_PCE[['PCE (%)']]
y_PCE = PCE.to_numpy()[:,0]

# Get the Input variables
df_input = df_out_03_PCE.drop(['mat0', 'mat1', 'PCE (%)'])
X = df_input.to_numpy()

temp_mat0 = df_out_03_PCE[['mat0']].to_numpy()
enc = OneHotEncoder(min_frequency=10, sparse_output=False)
encoded_mat0 = enc.fit_transform(temp_mat0)

temp_mat1 = df_out_03_PCE[['mat1']].to_numpy()
enc = OneHotEncoder(min_frequency=20, sparse_output=False)
encoded_mat1 = enc.fit_transform(temp_mat1)

X = np.concatenate((encoded_mat0, encoded_mat1, X), axis=1)

## PCE
print(f'PCE')
models_out_03_pce = compute_performance(models, X, y_PCE, b='auto', k=[9, 3], filename='pce_nout_qy_mat')
PCE
Train
|       Model       |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Linear Regression |   0.79 |   1.15 |   0.47 |
| K Neighbors       |   0.17 |   0.21 |    0.9 |
| Random Forest     |   0.35 |    0.3 |   0.86 |
| Gradient Boosting |   0.14 |   0.15 |   0.93 |
| XGBoost           |    0.4 |   0.45 |   0.79 |

Test
|       Model       |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Linear Regression |   1.02 |    1.8 |   0.16 |
| K Neighbors       |   0.88 |   1.72 |   0.22 |
| Random Forest     |   0.88 |   1.56 |   0.28 |
| Gradient Boosting |   0.96 |   1.99 |   0.06 |
| XGBoost           |    1.0 |   2.16 |  -0.02 |

Train
Test
In [ ]:
# Get Output variables
hopt = df_out_03_hopt[['hopt (%)']]
y_hopt = hopt.to_numpy()[:,0]

# Get the Input variables
df_input = df_out_03_hopt.drop(['mat0', 'mat1', 'hopt (%)'])
X = df_input.to_numpy()

temp_mat0 = df_out_03_hopt[['mat0']].to_numpy()
enc = OneHotEncoder(min_frequency=10, sparse_output=False)
encoded_mat0 = enc.fit_transform(temp_mat0)

temp_mat1 = df_out_03_hopt[['mat1']].to_numpy()
enc = OneHotEncoder(min_frequency=20, sparse_output=False)
encoded_mat1 = enc.fit_transform(temp_mat1)

X = np.concatenate((encoded_mat0, encoded_mat1, X), axis=1)

## PCE
print(f'hopt')
models_out_03_hopt = compute_performance(models, X, y_hopt, b='auto', k=[9, 3], filename='hopt_nout_qy_mat')
hopt
Train
|       Model       |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Linear Regression |   1.36 |   2.84 |   0.14 |
| K Neighbors       |   0.21 |   0.36 |   0.89 |
| Random Forest     |   0.63 |   0.64 |   0.81 |
| Gradient Boosting |   0.18 |   0.11 |   0.97 |
| XGBoost           |   0.97 |   1.53 |   0.54 |

Test
|       Model       |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Linear Regression |   1.71 |   4.66 |  -0.48 |
| K Neighbors       |    1.4 |   3.05 |   0.06 |
| Random Forest     |   1.41 |   3.02 |   0.04 |
| Gradient Boosting |   1.49 |    3.6 |  -0.12 |
| XGBoost           |   1.43 |   3.28 |  -0.04 |

Train
Test

Clustering¶

In [ ]:
df = pl.read_excel('db.xlsx')
#df = df.drop(['#'])
df_clustering_index = df.drop(['mat0', 'mat1', 'hopt (%)', 'PCE (%)'])
df_clustering_index = df_clustering_index.drop_nulls()
df_clustering = df_clustering_index.drop(['#'])
print_missing_samples(df_clustering)
Shape: (173, 7)
abs_peak: 0%
abs_min: 0%
abs_max: 0%
em_peak: 0%
em_min: 0%
em_max: 0%
QY (%): 0%
In [ ]:
import sklearn.metrics as sklearn_metrics

def inertia_score(X, labels):
  """This is the same of scikit-learn's KMeans.inertia_, but it works also in case of only one label."""

  inertia = 0.0
  for label in set(labels):
    X_cluster = X[labels == label, :]
    centroid = np.mean(X_cluster, axis=0)
    inertia += np.sum((X_cluster - centroid) ** 2)
  return inertia


def calinski_harabasz_score(X, labels):
  """Wrapper function of Scikit-learn's calinski_harabasz_score. The only difference is it doesn't throw an error where there is only one label."""
  
  if len(set(labels)) == 1:
    return float("NaN")
  else:
    return sklearn_metrics.calinski_harabasz_score(X, labels)


def davies_bouldin_score(X, labels):
  """Wrapper function of Scikit-learn's davies_bouldin_score. The only difference is it doesn't throw an error where there is only one label."""
  
  if len(set(labels)) == 1:
    return float("NaN")
  else:
    return sklearn_metrics.davies_bouldin_score(X, labels)


def silhouette_score(X, labels):
  """Wrapper function of Scikit-learn's silhouette_score. The only difference is it doesn't throw an error where there is only one label."""
  
  if len(set(labels)) == 1:
    return float("NaN")
  else:
    return sklearn_metrics.silhouette_score(X, labels)


def get_bic_aic(k, X):
    gmm = GaussianMixture(n_components=k, init_params='kmeans')
    gmm.fit(X)
    return gmm.bic(X), gmm.aic(X)
In [ ]:
# Cluster the materials without the hopt and PCE
X = df_clustering.to_numpy()
print(f'X = {X.shape}')

scores = {'elbow': [], 'calinski-harabasz': [], 'davies-bouldin': [], 'silhouette': [], 'bic': [], }

min_k = 2
max_k = 20

for k in range(min_k, max_k):
    kmeans = KMeans(n_clusters=k, max_iter=1000, n_init='auto', init='k-means++', random_state=5).fit(X)
    labels = kmeans.labels_
    scores['elbow'].append(inertia_score(X, labels))
    scores['calinski-harabasz'].append(calinski_harabasz_score(X, labels))
    scores['davies-bouldin'].append(davies_bouldin_score(X, labels))
    scores['silhouette'].append(silhouette_score(X, labels))
    bic, _ = get_bic_aic(k, X)
    scores['bic'].append(bic)
X = (173, 7)
In [ ]:
x = range(min_k, max_k)
y = scores['elbow']

points = np.zeros((len(y),2))
points[:,0] = x
points[:,1] = y
In [ ]:
methods = ['elbow', 'calinski-harabasz', 'davies-bouldin', 'silhouette', 'bic']
fig, axs = plt.subplots(1, 5)

for i in range(len(methods)):
    method = methods[i]
    
    axs[i].plot(range(min_k, max_k), scores[method])
    if method == 'elbow':
        idx = kneedle.auto_knee(points)
    elif method == 'calinski-harabasz':
        idx = scores[method].index(max(scores[method]))
        #axs[i].plot(x[idx], scores[method][idx], 'ro')
    elif method == 'davies-bouldin':
        idx = scores[method].index(min(scores[method]))
        #axs[i].plot(x[idx], scores[method][idx], 'ro')
    elif method == 'silhouette':
        idx = scores[method].index(max(scores[method]))
        #axs[i].plot(x[idx], scores[method][idx], 'ro')
    elif method == 'bic':
        idx = scores[method].index(max(scores[method]))
        #axs[i].plot(x[idx], scores[method][idx], 'ro')
    axs[i].plot(x[idx], scores[method][idx], 'ro')
fig = plt.gcf()
fig.savefig(f'figures/kmeans.pdf', bbox_inches='tight')
plt.show()
In [ ]:
def compute_avg_field_per_cluster(df_origin, df_clustering, labels, field='PCE (%)'):
    field_per_cluster = []
    min_k = min(labels)
    max_k = max(labels)
    #print(f'[{min_k}, {max_k}]')
    for k in range(min_k, max_k+1):
        mask = (labels == k)
        #print(f'k={k} = {mask} {mask.shape} {df.shape}')
        df_k = df_clustering.filter(mask)
        #print(f'{df_k}')
        cluster_labels = df_k['#'].to_list()
        #print(f'{cluster_labels}')
        filter_df = df_origin.filter(pl.col('#').is_in(cluster_labels))
        #print(filter_df)
        mean_field = filter_df[field].mean()
        field_per_cluster.append(mean_field)
    return field_per_cluster

def plot_stats_per_cluster(df_origin, df_clustering, labels):
    min_k = min(labels)
    max_k = max(labels)
    #print(f'[{min_k}, {max_k}]')
    for k in range(min_k, max_k+1):
        print(f'Cluster: {k}')
        mask = (labels == k)
        #print(f'k={k} = {mask} {mask.shape} {df.shape}')
        df_k = df_clustering.filter(mask)
        #print(f'{df_k}')
        cluster_labels = df_k['#'].to_list()
        #print(f'{cluster_labels}')
        filter_df = df_origin.filter(pl.col('#').is_in(cluster_labels))
        describe_variables(filter_df.drop(['#']), filename=f'cluster_{k}')

def find_nearest(array, value):
    diff = np.sum(np.abs(array - value), axis=1)
    idx = diff.argmin()
    return idx

def find_nearest_df(df, df_origin, value, k, field):
    df_no_id = df.drop(['#'])
    array = df_no_id.to_numpy()
    diff = np.sum(np.abs(array - value), axis=1)
    idxs = []
    
    i = j = 0
    while i < k and j < len(diff):
        j += 1
        idx = diff.argmin()
        #print(f'Current idx {idx}')
        diff[idx] = float('inf')
        #print(f'{df_origin.row(idx, named=True)}')
        if df_origin.row(idx, named=True)[field] is not None:
            idxs.append(idx)
            i += 1
    
    idxs = [df.row(i, named=True)['#'] for i in idxs]
    return idxs

def get_missing(df, field):
    df_qy_not_null = df.filter(pl.col('QY (%)').is_not_null())
    missing_df = df_qy_not_null.filter(pl.col(field).is_null())
    missing = missing_df.drop(['#', 'mat0', 'mat1', 'hopt (%)', 'PCE (%)']).to_numpy()
    return missing
    #return missing_df
    #vector_df = missing_df.drop(['#', 'mat0', 'mat1', 'hopt (%)', 'PCE (%)'])
    #missing = vector_df.to_numpy()

def predict_field_clustering(df_origin, df_clustering, kmeans, k=3, field='PCE (%)'):
    labels = kmeans.labels_
    centroids  = kmeans.cluster_centers_
    #centroid_labels = [centroids[i] for i in labels]
    #print(f'{labels} {centroids}')
    
    missing = get_missing(df_origin, field)
    #missing = missing_df.drop(['#', 'mat0', 'mat1', 'hopt (%)', 'PCE (%)']).to_numpy()

    # for each missing point, find the closest cluster
    rv = []
    for row in missing:
        closest_cluster = find_nearest(centroids, row)
        # select the corresponding cluster
        mask = (labels == closest_cluster)
        df_k = df_clustering.filter(mask)
        temp_id = df_k['#'].to_list()
        df_origin_k = df_origin.filter(pl.col('#').is_in(temp_id))
        idxs = find_nearest_df(df_k, df_origin_k, row, k, field)
        filter_df = df_origin.filter(pl.col('#').is_in(idxs))
        # compute the average
        pce_k = filter_df[field].to_numpy()
        rv.append((np.mean(pce_k), np.median(pce_k)))
    return rv

def get_clusters_df(df_origin, df_clustering, labels):
    min_k = min(labels)
    max_k = max(labels)

    df_per_cluster = []

    for k in range(min_k, max_k+1):
        mask = (labels == k)
        #print(f'k={k} = {mask} {mask.shape} {df.shape}')
        df_k = df_clustering.filter(mask)
        #print(f'{df_k}')
        cluster_labels = df_k['#'].to_list()
        #print(f'{cluster_labels}')
        filter_df = df_origin.filter(pl.col('#').is_in(cluster_labels))
        #print(filter_df)
        #mean_field = filter_df[field].mean()
        #field_per_cluster.append(mean_field)
        df_per_cluster.append(filter_df)
    return df_per_cluster


def rmse(measured, truth):
    return np.linalg.norm(measured - truth) / np.sqrt(len(truth))
In [ ]:
k=5
kmeans = KMeans(n_clusters=k, max_iter=1000, n_init='auto', init='k-means++', random_state=5).fit(X)
labels = kmeans.labels_
In [ ]:
avg_pce_cluster = compute_avg_field_per_cluster(df, df_clustering_index, labels, field='PCE (%)')
print(f'Avg PCE cluster: {avg_pce_cluster}')
avg_hopt_cluster = compute_avg_field_per_cluster(df, df_clustering_index, labels, field='hopt (%)')
print(f'Avg hOPT cluster: {avg_hopt_cluster}')
predictions_pce = predict_field_clustering(df, df_clustering_index, kmeans, k=3, field='PCE (%)')
print(f'Predictions PCE: {predictions_pce}')
predictions_hopt = predict_field_clustering(df, df_clustering_index, kmeans, k=3, field='hopt (%)')
print(f'Predictions hOPT: {predictions_hopt}')
Avg PCE cluster: [1.7782352941176471, 2.208725, 0.9611066247880435, 2.3942907096774193, 0.5521064117647059]
Avg hOPT cluster: [3.122, 9.250588235294117, 7.5371304347826085, 4.159564102564102, 5.469478260869566]
Predictions PCE: [(2.466666666666667, 2.9), (0.8433333333333334, 0.2), (1.9866666666666666, 1.96), (2.546666666666667, 2.9), (1.4166666666666667, 0.61), (0.7966666666666667, 0.61), (1.9217333333333333, 1.96), (0.07253333333333332, 0.0052), (1.96, 1.7), (5.4433333333333325, 4.29), (0.86, 0.2), (3.7266666666666666, 1.7), (2.63, 3.05), (2.9633333333333334, 2.94), (3.296666666666667, 3.1), (0.47333333333333333, 0.44), (0.47333333333333333, 0.44), (1.78, 1.7), (2.63, 3.05), (2.796666666666667, 2.39), (2.0033333333333334, 2.39), (1.0923333333333334, 0.77), (2.0366666666666666, 2.6), (0.11266666666666668, 0.117), (0.42333333333333334, 0.38), (2.42, 2.39), (1.7199666666666669, 2.3), (2.4, 1.96), (2.1672666666666665, 2.7), (1.7199666666666669, 2.3), (2.4, 1.96), (2.58, 1.94), (0.0011200000000000001, 0.00086), (0.015533333333333335, 0.0019), (2.2699999999999996, 1.94), (0.09355633333333334, 0.000471), (0.07253333333333332, 0.0052), (0.10832000000000001, 0.0441), (0.7966666666666667, 0.61), (0.09362, 0.08), (0.07253333333333332, 0.0052), (0.7966666666666667, 0.61), (1.21, 0.62), (0.8099666666666666, 0.08), (1.3350666666666668, 1.2), (1.23, 0.62), (0.8692000000000001, 0.0052), (1.8933333333333335, 2.6), (1.6499999999999997, 1.44), (1.5099999999999998, 1.13), (1.39, 1.7), (4.5566666666666675, 2.16), (2.466666666666667, 2.9), (1.1633333333333333, 0.62), (1.7666666666666666, 1.7), (1.623333333333333, 1.8), (0.779, 0.08), (1.3666666666666665, 1.7), (1.0566666666666666, 0.77), (2.23, 1.7), (4.140000000000001, 4.36)]
Predictions hOPT: [(4.0, 3.9), (2.56, 2.58), (1.1333333333333333, 1.4), (1.1333333333333333, 1.4), (1.1333333333333333, 1.4), (1.1333333333333333, 1.4), (11.866666666666667, 12.5), (4.566666666666666, 6.4), (4.366666666666667, 3.2), (34.669999999999995, 48.0), (4.566666666666666, 6.4), (5.433333333333333, 6.8), (7.099999999999999, 6.8), (4.653333333333333, 4.56), (1.1333333333333333, 1.4), (1.1333333333333333, 1.4), (2.3066666666666666, 1.75), (2.3066666666666666, 1.75), (1.1333333333333333, 1.4), (1.1333333333333333, 1.4), (3.233333333333333, 1.5), (3.7999999999999994, 3.3), (1.7266666666666666, 2.1), (1.53, 0.28), (1.553333333333333, 0.34), (3.86, 2.3), (1.4000000000000001, 1.2), (3.393333333333333, 2.1), (13.633333333333335, 2.7), (19.066666666666666, 19.0), (25.166666666666668, 19.0), (13.633333333333335, 2.7), (17.400000000000002, 3.2), (1.7166666666666668, 1.85), (3.2266666666666666, 1.6), (11.523333333333333, 4.56), (0.5976666666666667, 0.058), (5.8, 4.8), (2.8200000000000003, 3.51), (3.98, 3.2), (2.1833333333333336, 1.6), (3.5533333333333332, 3.2), (3.5533333333333332, 3.2), (3.61, 3.51)]
In [ ]:
mean_k_preds_pce = [p[0] for p in predictions_pce]
median_k_preds_pce = [p[1] for p in predictions_pce]

mean_k_preds_hopt = [p[0] for p in predictions_hopt]
median_k_preds_hopt = [p[1] for p in predictions_hopt]
In [ ]:
print(f'PCE')
missing = get_missing(df, field='PCE (%)')
for name, _, _ in models:
    print(f'Model: {name}')
    preds_pce     = models_02_pce[name].predict(missing)
    out_preds_pce = models_out_02_pce[name].predict(missing)

    print(f'Mean')
    diff_mean_preds = np.abs(preds_pce-mean_k_preds_pce)
    diff_mean_out_preds = np.abs(out_preds_pce-mean_k_preds_pce)

    ax = sns.violinplot(data=[diff_mean_preds, diff_mean_out_preds])
    ax.set_xticklabels(['Baseline', 'No Outliers'])
    fig = plt.gcf()
    fig.savefig(f'figures/cluster_pce_{name}_mean.pdf', bbox_inches='tight')
    plt.show()
    #print(f'RMSE {rmse(preds_pce, mean_k_preds_pce)} {rmse(out_preds_pce, mean_k_preds_pce)}')

    print(f'Median')
    diff_median_preds = np.abs(preds_pce-median_k_preds_pce)
    diff_median_out_preds = np.abs(out_preds_pce-median_k_preds_pce)

    ax = sns.violinplot(data=[diff_median_preds, diff_median_out_preds])
    ax.set_xticklabels(['Baseline', 'No Outliers'])
    fig = plt.gcf()
    fig.savefig(f'figures/cluster_pce_{name}_median.pdf', bbox_inches='tight')
    plt.show()
    #print(f'RMSE {rmse(preds_pce, median_k_preds_pce)} {rmse(out_preds_pce, median_k_preds_pce)}')

    score_mae_mean = mean_absolute_error(preds_pce, mean_k_preds_pce)
    score_mse_mean = mean_squared_error(preds_pce, mean_k_preds_pce)
    score_r2_mean  = r2_score(preds_pce, mean_k_preds_pce)
                
    score_mae_median = mean_absolute_error(preds_pce, median_k_preds_pce)
    score_mse_median = mean_squared_error(preds_pce, median_k_preds_pce)
    score_r2_median  = r2_score(preds_pce, median_k_preds_pce)

    # print the results
    print(f'Baseline')
    print(f'| {"Aggregation":^17} | {"MAE":^6} | {"MSE":^6} | {"R2":^6} |')
    print(f'| ----------------- | ------ | ------ | ------ |')
    print(f'| {"Mean":<17} | {round(score_mae_mean, 2):>6} | {round(score_mse_mean, 2):>6} | {round(score_r2_mean, 2):>6} |')
    print(f'| {"Median":<17} | {round(score_mae_median, 2):>6} | {round(score_mse_median, 2):>6} | {round(score_r2_median, 2):>6} |')
    print()

    score_mae_mean_out = mean_absolute_error(out_preds_pce, mean_k_preds_pce)
    score_mse_mean_out = mean_squared_error(out_preds_pce, mean_k_preds_pce)
    score_r2_mean_out  = r2_score(mean_k_preds_pce, out_preds_pce)
                
    score_mae_median_out = mean_absolute_error(out_preds_pce, median_k_preds_pce)
    score_mse_median_out = mean_squared_error(out_preds_pce, median_k_preds_pce)
    score_r2_median_out  = r2_score(median_k_preds_pce, out_preds_pce)

    # print the results
    print(f'No Outliers')
    print(f'| {"Aggregation":^17} | {"MAE":^6} | {"MSE":^6} | {"R2":^6} |')
    print(f'| ----------------- | ------ | ------ | ------ |')
    print(f'| {"Mean":<17} | {round(score_mae_mean_out, 2):>6} | {round(score_mse_mean_out, 2):>6} | {round(score_r2_mean_out, 2):>6} |')
    print(f'| {"Median":<17} | {round(score_mae_median_out, 2):>6} | {round(score_mse_median_out, 2):>6} | {round(score_r2_median_out, 2):>6} |')
    print()
PCE
Model: Linear Regression
Mean
Median
Baseline
|    Aggregation    |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Mean              |   0.97 |   1.47 |  -1.87 |
| Median            |   1.02 |   1.48 |  -1.89 |

No Outliers
|    Aggregation    |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Mean              |   0.84 |   1.29 |   0.03 |
| Median            |   0.89 |   1.19 |   0.09 |

Model: K Neighbors
Mean
Median
Baseline
|    Aggregation    |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Mean              |    0.8 |   1.35 |   -0.2 |
| Median            |   0.84 |   1.36 |  -0.21 |

No Outliers
|    Aggregation    |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Mean              |   0.68 |   0.99 |   0.26 |
| Median            |   0.74 |   0.81 |   0.38 |

Model: Random Forest
Mean
Median
Baseline
|    Aggregation    |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Mean              |   0.86 |   1.13 |  -1.23 |
| Median            |   0.91 |   1.24 |  -1.45 |

No Outliers
|    Aggregation    |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Mean              |   0.79 |   1.24 |   0.07 |
| Median            |   0.87 |    1.1 |   0.16 |

Model: Gradient Boosting
Mean
Median
Baseline
|    Aggregation    |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Mean              |   1.18 |   2.86 |  -0.13 |
| Median            |   1.21 |   2.78 |   -0.1 |

No Outliers
|    Aggregation    |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Mean              |   0.99 |    1.8 |  -0.35 |
| Median            |   1.03 |   1.58 |   -0.2 |

Model: XGBoost
Mean
Median
Baseline
|    Aggregation    |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Mean              |   0.83 |   1.07 |  -3.07 |
| Median            |    0.9 |   1.04 |  -2.97 |

No Outliers
|    Aggregation    |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Mean              |   0.86 |   1.46 |   -0.1 |
| Median            |   0.89 |   1.23 |   0.07 |

In [ ]:
print(f'hOPT')
missing = get_missing(df, field='hopt (%)')
for name, _, _ in models:
    print(f'Model: {name}')
    preds_hopt     = models_02_hopt[name].predict(missing)
    out_preds_hopt = models_out_02_hopt[name].predict(missing)
    
    print(f'Mean')
    diff_mean_preds = np.abs(preds_hopt-mean_k_preds_hopt)
    diff_mean_out_preds = np.abs(out_preds_hopt-mean_k_preds_hopt)

    ax = sns.violinplot(data=[diff_mean_preds, diff_mean_out_preds])
    ax.set_xticklabels(['Baseline', 'No Outliers'])
    fig = plt.gcf()
    fig.savefig(f'figures/cluster_hopt_{name}_mean.pdf', bbox_inches='tight')
    plt.show()
    #print(f'RMSE {rmse(preds_pce, mean_k_preds_pce)} {rmse(out_preds_pce, mean_k_preds_pce)}')

    print(f'Median')
    diff_median_preds = np.abs(preds_hopt-median_k_preds_hopt)
    diff_median_out_preds = np.abs(out_preds_hopt-median_k_preds_hopt)

    ax = sns.violinplot(data=[diff_median_preds, diff_median_out_preds])
    ax.set_xticklabels(['Baseline', 'No Outliers'])
    fig = plt.gcf()
    fig.savefig(f'figures/cluster_hopt_{name}_median.pdf', bbox_inches='tight')
    plt.show()
    #print(f'RMSE {rmse(preds_pce, median_k_preds_pce)} {rmse(out_preds_pce, median_k_preds_pce)}')

    score_mae_mean = mean_absolute_error(preds_hopt, mean_k_preds_hopt)
    score_mse_mean = mean_squared_error(preds_hopt, mean_k_preds_hopt)
    score_r2_mean  = r2_score(preds_hopt, mean_k_preds_hopt)
                
    score_mae_median = mean_absolute_error(preds_hopt, median_k_preds_hopt)
    score_mse_median = mean_squared_error(preds_hopt, median_k_preds_hopt)
    score_r2_median  = r2_score(preds_hopt, median_k_preds_hopt)

    # print the results
    print(f'Baseline')
    print(f'| {"Aggregation":^17} | {"MAE":^6} | {"MSE":^6} | {"R2":^6} |')
    print(f'| ----------------- | ------ | ------ | ------ |')
    print(f'| {"Mean":<17} | {round(score_mae_mean, 2):>6} | {round(score_mse_mean, 2):>6} | {round(score_r2_mean, 2):>6} |')
    print(f'| {"Median":<17} | {round(score_mae_median, 2):>6} | {round(score_mse_median, 2):>6} | {round(score_r2_median, 2):>6} |')
    print()

    score_mae_mean_out = mean_absolute_error(out_preds_hopt, mean_k_preds_hopt)
    score_mse_mean_out = mean_squared_error(out_preds_hopt, mean_k_preds_hopt)
    score_r2_mean_out  = r2_score(mean_k_preds_hopt, out_preds_hopt)
                
    score_mae_median_out = mean_absolute_error(out_preds_hopt, median_k_preds_hopt)
    score_mse_median_out = mean_squared_error(out_preds_hopt, median_k_preds_hopt)
    score_r2_median_out  = r2_score(median_k_preds_hopt, out_preds_hopt)

    # print the results
    print(f'No Outliers')
    print(f'| {"Aggregation":^17} | {"MAE":^6} | {"MSE":^6} | {"R2":^6} |')
    print(f'| ----------------- | ------ | ------ | ------ |')
    print(f'| {"Mean":<17} | {round(score_mae_mean_out, 2):>6} | {round(score_mse_mean_out, 2):>6} | {round(score_r2_mean_out, 2):>6} |')
    print(f'| {"Median":<17} | {round(score_mae_median_out, 2):>6} | {round(score_mse_median_out, 2):>6} | {round(score_r2_median_out, 2):>6} |')
    print()
hOPT
Model: Linear Regression
Mean
Median
Baseline
|    Aggregation    |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Mean              |   4.99 |  47.15 |  -2.12 |
| Median            |   4.79 |  57.43 |   -2.8 |

No Outliers
|    Aggregation    |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Mean              |   3.84 |  56.54 |  -0.18 |
| Median            |   3.07 |  63.06 |  -0.06 |

Model: K Neighbors
Mean
Median
Baseline
|    Aggregation    |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Mean              |   2.65 |  23.39 |   0.49 |
| Median            |   1.98 |  15.15 |   0.67 |

No Outliers
|    Aggregation    |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Mean              |   3.67 |   57.3 |  -0.19 |
| Median            |   2.77 |  62.26 |  -0.04 |

Model: Random Forest
Mean
Median
Baseline
|    Aggregation    |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Mean              |   3.52 |  39.02 |   0.46 |
| Median            |   2.41 |  21.55 |    0.7 |

No Outliers
|    Aggregation    |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Mean              |   3.59 |  58.64 |  -0.22 |
| Median            |   2.87 |  64.29 |  -0.08 |

Model: Gradient Boosting
Mean
Median
Baseline
|    Aggregation    |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Mean              |   3.42 |  36.85 |   0.43 |
| Median            |   2.09 |  18.12 |   0.72 |

No Outliers
|    Aggregation    |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Mean              |   3.92 |  61.96 |  -0.29 |
| Median            |    3.2 |  67.39 |  -0.13 |

Model: XGBoost
Mean
Median
Baseline
|    Aggregation    |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Mean              |   2.95 |  28.03 |  -0.29 |
| Median            |   2.31 |  19.48 |    0.1 |

No Outliers
|    Aggregation    |  MAE   |  MSE   |   R2   |
| ----------------- | ------ | ------ | ------ |
| Mean              |   3.79 |  59.47 |  -0.24 |
| Median            |   2.93 |   64.6 |  -0.08 |

In [ ]:
df_per_cluster = get_clusters_df(df, df_clustering_index, labels)

for i, df in enumerate(df_per_cluster):
    print(f'Cluster {i}')
    print(f'{df}')
Cluster 0
shape: (25, 12)
┌─────┬──────┬───────────┬──────────┬───┬────────┬────────┬──────────┬─────────┐
│ #   ┆ mat0 ┆ mat1      ┆ abs_peak ┆ … ┆ em_max ┆ QY (%) ┆ hopt (%) ┆ PCE (%) │
│ --- ┆ ---  ┆ ---       ┆ ---      ┆   ┆ ---    ┆ ---    ┆ ---      ┆ ---     │
│ i64 ┆ str  ┆ str       ┆ i64      ┆   ┆ i64    ┆ f64    ┆ f64      ┆ f64     │
╞═════╪══════╪═══════════╪══════════╪═══╪════════╪════════╪══════════╪═════════╡
│ 8   ┆ QD   ┆ solution  ┆ 800      ┆ … ┆ 1000   ┆ 30.0   ┆ 1.4      ┆ 3.2     │
│ 14  ┆ dye  ┆ solution  ┆ 466      ┆ … ┆ 900    ┆ 11.0   ┆ 0.6      ┆ null    │
│ 15  ┆ dye  ┆ solution  ┆ 557      ┆ … ┆ 900    ┆ 9.0    ┆ 0.5      ┆ null    │
│ 26  ┆ dye  ┆ film      ┆ 745      ┆ … ┆ 900    ┆ 25.0   ┆ null     ┆ 0.61    │
│ 27  ┆ dye  ┆ film      ┆ 745      ┆ … ┆ 900    ┆ 25.0   ┆ null     ┆ 1.24    │
│ 28  ┆ dye  ┆ film      ┆ 745      ┆ … ┆ 900    ┆ 25.0   ┆ null     ┆ 0.54    │
│ 29  ┆ dye  ┆ film      ┆ 745      ┆ … ┆ 900    ┆ 25.0   ┆ null     ┆ 1.41    │
│ 34  ┆ NP   ┆ bulk      ┆ 550      ┆ … ┆ 1000   ┆ 80.0   ┆ 6.8      ┆ null    │
│ 36  ┆ QD   ┆ film      ┆ 450      ┆ … ┆ 975    ┆ 40.0   ┆ null     ┆ null    │
│ 37  ┆ QD   ┆ waveguide ┆ 650      ┆ … ┆ 850    ┆ 63.0   ┆ 1.75     ┆ null    │
│ 38  ┆ QD   ┆ solution  ┆ 650      ┆ … ┆ 850    ┆ 63.0   ┆ 3.67     ┆ null    │
│ 55  ┆ QD   ┆ bulk      ┆ 500      ┆ … ┆ 1000   ┆ 50.0   ┆ null     ┆ 2.85    │
│ 57  ┆ QD   ┆ film      ┆ 450      ┆ … ┆ 1100   ┆ 91.0   ┆ 8.1      ┆ 2.94    │
│ 61  ┆ QD   ┆ bulk      ┆ 500      ┆ … ┆ 1000   ┆ 78.0   ┆ 6.4      ┆ 3.1     │
│ 62  ┆ QD   ┆ bulk      ┆ 640      ┆ … ┆ 1200   ┆ 40.0   ┆ null     ┆ 3.27    │
│ 63  ┆ QD   ┆ bulk      ┆ 415      ┆ … ┆ 1240   ┆ 60.3   ┆ null     ┆ 3.94    │
│ 72  ┆ dye  ┆ film      ┆ 760      ┆ … ┆ 900    ┆ 24.0   ┆ null     ┆ 0.44    │
│ 73  ┆ dye  ┆ film      ┆ 760      ┆ … ┆ 900    ┆ 24.0   ┆ null     ┆ 0.28    │
│ 74  ┆ dye  ┆ film      ┆ 700      ┆ … ┆ 900    ┆ 30.0   ┆ null     ┆ 0.62    │
│ 75  ┆ dye  ┆ film      ┆ 700      ┆ … ┆ 900    ┆ 30.0   ┆ null     ┆ 0.36    │
│ 76  ┆ dye  ┆ film      ┆ 738      ┆ … ┆ 900    ┆ 23.0   ┆ null     ┆ 0.41    │
│ 77  ┆ dye  ┆ film      ┆ 738      ┆ … ┆ 900    ┆ 23.0   ┆ null     ┆ 0.28    │
│ 79  ┆ QD   ┆ bulk      ┆ 763      ┆ … ┆ 1050   ┆ 70.0   ┆ null     ┆ 4.74    │
│ 106 ┆ dye  ┆ film      ┆ 770      ┆ … ┆ 950    ┆ 16.0   ┆ 1.5      ┆ null    │
│ 110 ┆ dye  ┆ fiber     ┆ 780      ┆ … ┆ 850    ┆ 21.0   ┆ 0.5      ┆ null    │
└─────┴──────┴───────────┴──────────┴───┴────────┴────────┴──────────┴─────────┘
Cluster 1
shape: (46, 12)
┌─────┬─────────┬──────────┬──────────┬───┬────────┬────────┬──────────┬─────────┐
│ #   ┆ mat0    ┆ mat1     ┆ abs_peak ┆ … ┆ em_max ┆ QY (%) ┆ hopt (%) ┆ PCE (%) │
│ --- ┆ ---     ┆ ---      ┆ ---      ┆   ┆ ---    ┆ ---    ┆ ---      ┆ ---     │
│ i64 ┆ str     ┆ str      ┆ i64      ┆   ┆ i64    ┆ f64    ┆ f64      ┆ f64     │
╞═════╪═════════╪══════════╪══════════╪═══╪════════╪════════╪══════════╪═════════╡
│ 1   ┆ dye     ┆ film     ┆ 578      ┆ … ┆ 750    ┆ 98.0   ┆ 18.8     ┆ null    │
│ 9   ┆ QD      ┆ solution ┆ 600      ┆ … ┆ 700    ┆ 50.0   ┆ 0.5      ┆ 1.2     │
│ 11  ┆ QD      ┆ solution ┆ 376      ┆ … ┆ 700    ┆ 50.0   ┆ 0.3      ┆ null    │
│ 12  ┆ dye     ┆ solution ┆ 413      ┆ … ┆ 800    ┆ 67.0   ┆ 3.4      ┆ null    │
│ 13  ┆ dye     ┆ solution ┆ 550      ┆ … ┆ 750    ┆ 95.0   ┆ 2.6      ┆ null    │
│ 16  ┆ polymer ┆ solution ┆ 460      ┆ … ┆ 800    ┆ 45.0   ┆ 1.0      ┆ null    │
│ 17  ┆ polymer ┆ solution ┆ 467      ┆ … ┆ 700    ┆ 48.0   ┆ 0.9      ┆ null    │
│ 25  ┆ dye     ┆ bulk     ┆ 580      ┆ … ┆ 800    ┆ 100.0  ┆ null     ┆ 0.0018  │
│ 30  ┆ dye     ┆ bulk     ┆ 525      ┆ … ┆ 800    ┆ 97.6   ┆ null     ┆ 2.6     │
│ 51  ┆ NP      ┆ film     ┆ 500      ┆ … ┆ 600    ┆ 92.0   ┆ null     ┆ null    │
│ 56  ┆ QD      ┆ bulk     ┆ 500      ┆ … ┆ 700    ┆ 35.9   ┆ 1.45     ┆ null    │
│ 68  ┆ dye     ┆ bulk     ┆ 478      ┆ … ┆ 800    ┆ 61.1   ┆ 22.0     ┆ null    │
│ 69  ┆ dye     ┆ bulk     ┆ 513      ┆ … ┆ 800    ┆ 24.8   ┆ 3.3      ┆ null    │
│ 71  ┆ dye     ┆ bulk     ┆ 473      ┆ … ┆ 800    ┆ 44.3   ┆ 24.7     ┆ null    │
│ 84  ┆ Ln      ┆ bulk     ┆ 590      ┆ … ┆ 700    ┆ 65.0   ┆ null     ┆ 11.3    │
│ 98  ┆ dye     ┆ film     ┆ 520      ┆ … ┆ 700    ┆ 78.0   ┆ 0.08     ┆ null    │
│ 99  ┆ dye     ┆ fiber    ┆ 520      ┆ … ┆ 700    ┆ 78.0   ┆ 1.6      ┆ 0.0052  │
│ 100 ┆ dye     ┆ fiber    ┆ 520      ┆ … ┆ 700    ┆ 93.0   ┆ 8.0      ┆ 0.0024  │
│ 105 ┆ dye     ┆ bulk     ┆ 570      ┆ … ┆ 750    ┆ 80.0   ┆ 14.5     ┆ 2.16    │
│ 107 ┆ dye     ┆ bulk     ┆ 665      ┆ … ┆ 750    ┆ 12.0   ┆ 3.7      ┆ 0.1     │
│ 109 ┆ dye     ┆ fiber    ┆ 560      ┆ … ┆ 700    ┆ 95.0   ┆ 2.1      ┆ null    │
│ 112 ┆ dye     ┆ fiber    ┆ 560      ┆ … ┆ 700    ┆ 95.0   ┆ null     ┆ 0.21    │
│ 113 ┆ QD      ┆ bulk     ┆ 580      ┆ … ┆ 700    ┆ 10.0   ┆ null     ┆ 2.1     │
│ 114 ┆ QD      ┆ bulk     ┆ 580      ┆ … ┆ 700    ┆ 60.0   ┆ null     ┆ 2.1     │
│ 115 ┆ dye     ┆ bulk     ┆ 575      ┆ … ┆ 720    ┆ 95.0   ┆ null     ┆ 3.3     │
│ 116 ┆ QD      ┆ bulk     ┆ 580      ┆ … ┆ 700    ┆ 10.0   ┆ null     ┆ 2.1     │
│ 117 ┆ QD      ┆ bulk     ┆ 450      ┆ … ┆ 680    ┆ 44.0   ┆ null     ┆ 2.8     │
│ 118 ┆ QD      ┆ bulk     ┆ 450      ┆ … ┆ 650    ┆ 86.0   ┆ 48.0     ┆ null    │
│ 121 ┆ Ln      ┆ bulk     ┆ 530      ┆ … ┆ 750    ┆ 83.0   ┆ null     ┆ 1.44    │
│ 122 ┆ QD      ┆ bulk     ┆ 473      ┆ … ┆ 700    ┆ 45.0   ┆ 1.0      ┆ null    │
│ 124 ┆ QD      ┆ bulk     ┆ 470      ┆ … ┆ 700    ┆ 20.0   ┆ 2.01     ┆ null    │
│ 125 ┆ dye     ┆ bulk     ┆ 521      ┆ … ┆ 700    ┆ 93.0   ┆ 54.0     ┆ null    │
│ 128 ┆ dye     ┆ film     ┆ 545      ┆ … ┆ 800    ┆ 50.0   ┆ 12.5     ┆ null    │
│ 129 ┆ dye     ┆ solution ┆ 498      ┆ … ┆ 700    ┆ 30.0   ┆ 6.88     ┆ 0.27    │
│ 130 ┆ dye     ┆ solution ┆ 569      ┆ … ┆ 750    ┆ 61.0   ┆ 2.58     ┆ null    │
│ 134 ┆ CD      ┆ bulk     ┆ 720      ┆ … ┆ 600    ┆ 65.0   ┆ null     ┆ 8.75    │
│ 136 ┆ CD      ┆ film     ┆ 557      ┆ … ┆ 700    ┆ 70.0   ┆ 2.3      ┆ null    │
│ 138 ┆ dye     ┆ bulk     ┆ 575      ┆ … ┆ 700    ┆ 96.0   ┆ 37.7     ┆ null    │
│ 142 ┆ dye     ┆ bulk     ┆ 575      ┆ … ┆ 720    ┆ 95.0   ┆ 19.0     ┆ 2.9     │
│ 143 ┆ QD      ┆ bulk     ┆ 480      ┆ … ┆ 700    ┆ 15.7   ┆ 3.2      ┆ 0.62    │
│ 146 ┆ QDdye   ┆ bulk     ┆ 500      ┆ … ┆ 750    ┆ 32.7   ┆ 1.0      ┆ null    │
│ 164 ┆ CD      ┆ film     ┆ 555      ┆ … ┆ 800    ┆ 17.6   ┆ 3.0      ┆ 2.7     │
│ 165 ┆ CD      ┆ film     ┆ 450      ┆ … ┆ 800    ┆ 60.0   ┆ 4.3      ┆ 3.8     │
│ 171 ┆ CD      ┆ film     ┆ 410      ┆ … ┆ 750    ┆ 7.6    ┆ 2.77     ┆ 1.96    │
│ 173 ┆ QD      ┆ bulk     ┆ 600      ┆ … ┆ 700    ┆ 30.0   ┆ 2.7      ┆ 0.38    │
│ 200 ┆ dye     ┆ solution ┆ 660      ┆ … ┆ 800    ┆ 31.0   ┆ 2.65     ┆ 0.21    │
└─────┴─────────┴──────────┴──────────┴───┴────────┴────────┴──────────┴─────────┘
Cluster 2
shape: (23, 12)
┌─────┬─────────┬──────────┬──────────┬───┬────────┬────────┬──────────┬──────────┐
│ #   ┆ mat0    ┆ mat1     ┆ abs_peak ┆ … ┆ em_max ┆ QY (%) ┆ hopt (%) ┆ PCE (%)  │
│ --- ┆ ---     ┆ ---      ┆ ---      ┆   ┆ ---    ┆ ---    ┆ ---      ┆ ---      │
│ i64 ┆ str     ┆ str      ┆ i64      ┆   ┆ i64    ┆ f64    ┆ f64      ┆ f64      │
╞═════╪═════════╪══════════╪══════════╪═══╪════════╪════════╪══════════╪══════════╡
│ 177 ┆ Ln      ┆ film     ┆ 450      ┆ … ┆ 570    ┆ 44.0   ┆ 4.8      ┆ 0.054    │
│ 178 ┆ Ln      ┆ film     ┆ 400      ┆ … ┆ 470    ┆ 37.0   ┆ 7.7      ┆ 0.058    │
│ 179 ┆ CD      ┆ film     ┆ 425      ┆ … ┆ 400    ┆ 11.0   ┆ 13.1     ┆ 0.053    │
│ 180 ┆ Ln      ┆ film     ┆ 450      ┆ … ┆ 570    ┆ 44.0   ┆ 5.2      ┆ 0.046    │
│ 181 ┆ Ln      ┆ film     ┆ 400      ┆ … ┆ 470    ┆ 37.0   ┆ 7.7      ┆ 0.047    │
│ 182 ┆ CD      ┆ film     ┆ 425      ┆ … ┆ 400    ┆ 11.0   ┆ 12.8     ┆ 0.041    │
│ 183 ┆ Ln      ┆ film     ┆ 450      ┆ … ┆ 570    ┆ 59.0   ┆ 6.7      ┆ 0.074    │
│ 184 ┆ Ln      ┆ film     ┆ 400      ┆ … ┆ 470    ┆ 54.0   ┆ 8.5      ┆ 0.065    │
│ 185 ┆ Ln      ┆ film     ┆ 450      ┆ … ┆ 570    ┆ 59.0   ┆ 10.7     ┆ 0.096    │
│ 186 ┆ Ln      ┆ film     ┆ 400      ┆ … ┆ 470    ┆ 54.0   ┆ 8.7      ┆ 0.053    │
│ 187 ┆ Ln      ┆ film     ┆ 450      ┆ … ┆ 570    ┆ 59.0   ┆ 11.7     ┆ 0.142    │
│ 188 ┆ Ln      ┆ film     ┆ 400      ┆ … ┆ 470    ┆ 54.0   ┆ 16.5     ┆ 0.136    │
│ 189 ┆ dye     ┆ film     ┆ 650      ┆ … ┆ 600    ┆ 4.0    ┆ 2.6      ┆ 0.044    │
│ 190 ┆ polymer ┆ film     ┆ 500      ┆ … ┆ 400    ┆ 47.0   ┆ 5.71     ┆ 2.29     │
│ 191 ┆ polymer ┆ film     ┆ 500      ┆ … ┆ 450    ┆ 57.0   ┆ 9.112    ┆ 2.32     │
│ 192 ┆ polymer ┆ film     ┆ 500      ┆ … ┆ 400    ┆ 68.0   ┆ 12.08    ┆ 2.47     │
│ 193 ┆ polymer ┆ film     ┆ 500      ┆ … ┆ 400    ┆ 47.0   ┆ 5.71     ┆ 4.38     │
│ 194 ┆ polymer ┆ film     ┆ 500      ┆ … ┆ 450    ┆ 57.0   ┆ 9.112    ┆ 4.62     │
│ 195 ┆ polymer ┆ film     ┆ 500      ┆ … ┆ 400    ┆ 68.0   ┆ 12.08    ┆ 4.92     │
│ 196 ┆ NP      ┆ bulk     ┆ 600      ┆ … ┆ 490    ┆ 65.0   ┆ 0.15     ┆ 0.049413 │
│ 197 ┆ dye     ┆ bulk     ┆ 600      ┆ … ┆ 575    ┆ 70.0   ┆ 0.16     ┆ 0.050786 │
│ 198 ┆ NPdye   ┆ bulk     ┆ 600      ┆ … ┆ 500    ┆ 68.0   ┆ 0.22     ┆ 0.07531  │
│ 199 ┆ QD      ┆ solution ┆ 510      ┆ … ┆ 475    ┆ 89.0   ┆ 2.32     ┆ 0.020944 │
└─────┴─────────┴──────────┴──────────┴───┴────────┴────────┴──────────┴──────────┘
Cluster 3
shape: (51, 12)
┌─────┬──────┬──────────┬──────────┬───┬────────┬────────┬──────────┬──────────┐
│ #   ┆ mat0 ┆ mat1     ┆ abs_peak ┆ … ┆ em_max ┆ QY (%) ┆ hopt (%) ┆ PCE (%)  │
│ --- ┆ ---  ┆ ---      ┆ ---      ┆   ┆ ---    ┆ ---    ┆ ---      ┆ ---      │
│ i64 ┆ str  ┆ str      ┆ i64      ┆   ┆ i64    ┆ f64    ┆ f64      ┆ f64      │
╞═════╪══════╪══════════╪══════════╪═══╪════════╪════════╪══════════╪══════════╡
│ 18  ┆ dye  ┆ film     ┆ 403      ┆ … ┆ 600    ┆ 40.8   ┆ 7.7      ┆ null     │
│ 19  ┆ dye  ┆ bulk     ┆ 374      ┆ … ┆ 600    ┆ 100.0  ┆ null     ┆ null     │
│ 23  ┆ dye  ┆ bulk     ┆ 340      ┆ … ┆ 700    ┆ 14.0   ┆ 0.25     ┆ null     │
│ 24  ┆ dye  ┆ bulk     ┆ 340      ┆ … ┆ 600    ┆ 78.0   ┆ 0.4      ┆ null     │
│ 43  ┆ CD   ┆ film     ┆ 350      ┆ … ┆ 650    ┆ 45.0   ┆ 12.0     ┆ null     │
│ 44  ┆ CD   ┆ film     ┆ 358      ┆ … ┆ 600    ┆ 94.0   ┆ 3.9      ┆ null     │
│ 46  ┆ CD   ┆ bulk     ┆ 340      ┆ … ┆ 700    ┆ 40.0   ┆ 0.92     ┆ null     │
│ 47  ┆ CDQD ┆ tandem   ┆ 450      ┆ … ┆ 700    ┆ 45.0   ┆ 1.4      ┆ null     │
│ 49  ┆ CDQD ┆ bulk     ┆ 400      ┆ … ┆ 550    ┆ 70.0   ┆ null     ┆ 3.05     │
│ 52  ┆ NP   ┆ bulk     ┆ 325      ┆ … ┆ 525    ┆ 58.0   ┆ 2.4      ┆ 1.8      │
│ 54  ┆ CD   ┆ bulk     ┆ 440      ┆ … ┆ 700    ┆ 25.0   ┆ 1.2      ┆ null     │
│ 58  ┆ QD   ┆ bulk     ┆ 350      ┆ … ┆ 775    ┆ 81.0   ┆ 26.5     ┆ 8.71     │
│ 65  ┆ QD   ┆ bulk     ┆ 360      ┆ … ┆ 700    ┆ 53.0   ┆ null     ┆ null     │
│ 82  ┆ dye  ┆ bulk     ┆ 450      ┆ … ┆ 600    ┆ 17.0   ┆ null     ┆ 8.99     │
│ 88  ┆ Ln   ┆ film     ┆ 290      ┆ … ┆ 650    ┆ 40.0   ┆ 8.8      ┆ null     │
│ 91  ┆ Ln   ┆ film     ┆ 325      ┆ … ┆ 650    ┆ 12.0   ┆ 1.7      ┆ null     │
│ 123 ┆ QD   ┆ bulk     ┆ 350      ┆ … ┆ 750    ┆ 56.0   ┆ null     ┆ 8.71     │
│ 126 ┆ CD   ┆ film     ┆ 450      ┆ … ┆ 700    ┆ 11.0   ┆ 7.58     ┆ 6.0      │
│ 131 ┆ dye  ┆ solution ┆ 488      ┆ … ┆ 600    ┆ 51.0   ┆ 3.3      ┆ 0.35     │
│ 132 ┆ CD   ┆ bulk     ┆ 460      ┆ … ┆ 500    ┆ 54.0   ┆ 2.7      ┆ null     │
│ 133 ┆ CD   ┆ bulk     ┆ 340      ┆ … ┆ 700    ┆ 6.0    ┆ 5.84     ┆ null     │
│ 135 ┆ CD   ┆ film     ┆ 420      ┆ … ┆ 700    ┆ 40.0   ┆ 1.6      ┆ 0.7      │
│ 137 ┆ CD   ┆ film     ┆ 420      ┆ … ┆ 600    ┆ 67.0   ┆ 2.2      ┆ 1.13     │
│ 139 ┆ CD   ┆ solution ┆ 491      ┆ … ┆ 700    ┆ 82.0   ┆ 5.43     ┆ 0.18     │
│ 140 ┆ CD   ┆ film     ┆ 510      ┆ … ┆ 650    ┆ 78.0   ┆ 0.058    ┆ 0.00083  │
│ 141 ┆ CD   ┆ film     ┆ 510      ┆ … ┆ 650    ┆ 78.0   ┆ 1.7      ┆ 0.014    │
│ 144 ┆ CD   ┆ film     ┆ 405      ┆ … ┆ 700    ┆ 70.0   ┆ 3.2      ┆ 1.9      │
│ 145 ┆ CD   ┆ film     ┆ 405      ┆ … ┆ 700    ┆ 65.0   ┆ 2.9      ┆ 1.7      │
│ 147 ┆ CD   ┆ film     ┆ 347      ┆ … ┆ 700    ┆ 61.0   ┆ 4.56     ┆ 4.1      │
│ 148 ┆ Ln   ┆ bulk     ┆ 405      ┆ … ┆ 700    ┆ 81.0   ┆ 3.4      ┆ 1.37     │
│ 149 ┆ CD   ┆ film     ┆ 490      ┆ … ┆ 650    ┆ 80.5   ┆ null     ┆ 2.06     │
│ 150 ┆ CD   ┆ film     ┆ 490      ┆ … ┆ 650    ┆ 80.5   ┆ 4.8      ┆ 4.36     │
│ 151 ┆ CD   ┆ film     ┆ 380      ┆ … ┆ 700    ┆ 11.54  ┆ 1.36     ┆ null     │
│ 152 ┆ QD   ┆ film     ┆ 300      ┆ … ┆ 550    ┆ 35.91  ┆ 3.08     ┆ null     │
│ 154 ┆ CDQD ┆ film     ┆ 320      ┆ … ┆ 700    ┆ 23.0   ┆ 1.89     ┆ null     │
│ 155 ┆ CDQD ┆ film     ┆ 320      ┆ … ┆ 700    ┆ 22.0   ┆ 2.54     ┆ null     │
│ 156 ┆ CDQD ┆ film     ┆ 320      ┆ … ┆ 700    ┆ 26.0   ┆ 3.76     ┆ null     │
│ 157 ┆ CD   ┆ film     ┆ 350      ┆ … ┆ 750    ┆ 35.0   ┆ null     ┆ 1.9      │
│ 158 ┆ CD   ┆ film     ┆ 380      ┆ … ┆ 700    ┆ 35.0   ┆ null     ┆ 1.7      │
│ 159 ┆ CD   ┆ film     ┆ 370      ┆ … ┆ 775    ┆ 35.0   ┆ null     ┆ 2.3      │
│ 160 ┆ CD   ┆ film     ┆ 400      ┆ … ┆ 700    ┆ 33.0   ┆ 4.5      ┆ 0.117    │
│ 161 ┆ CD   ┆ bulk     ┆ 400      ┆ … ┆ 700    ┆ 41.0   ┆ 5.89     ┆ 0.16     │
│ 162 ┆ CD   ┆ bulk     ┆ 400      ┆ … ┆ 700    ┆ 41.0   ┆ 3.13     ┆ 0.061    │
│ 168 ┆ CD   ┆ bulk     ┆ 470      ┆ … ┆ 650    ┆ 9.6    ┆ 9.3      ┆ null     │
│ 169 ┆ CD   ┆ film     ┆ 380      ┆ … ┆ 700    ┆ 41.52  ┆ 3.51     ┆ 2.39     │
│ 170 ┆ CD   ┆ film     ┆ 355      ┆ … ┆ 650    ┆ 15.01  ┆ 2.76     ┆ 1.94     │
│ 172 ┆ CD   ┆ film     ┆ 400      ┆ … ┆ 750    ┆ 22.0   ┆ 4.03     ┆ 2.92     │
│ 174 ┆ QD   ┆ bulk     ┆ 350      ┆ … ┆ 700    ┆ 91.0   ┆ null     ┆ 4.29     │
│ 175 ┆ QD   ┆ bulk     ┆ 350      ┆ … ┆ 700    ┆ 91.0   ┆ null     ┆ 0.55     │
│ 176 ┆ QD   ┆ bulk     ┆ 350      ┆ … ┆ 700    ┆ 11.0   ┆ null     ┆ 0.77     │
│ 202 ┆ CD   ┆ film     ┆ 510      ┆ … ┆ 650    ┆ 78.0   ┆ 0.035    ┆ 0.000182 │
└─────┴──────┴──────────┴──────────┴───┴────────┴────────┴──────────┴──────────┘
Cluster 4
shape: (28, 12)
┌─────┬──────┬───────┬──────────┬───┬────────┬────────┬──────────┬──────────┐
│ #   ┆ mat0 ┆ mat1  ┆ abs_peak ┆ … ┆ em_max ┆ QY (%) ┆ hopt (%) ┆ PCE (%)  │
│ --- ┆ ---  ┆ ---   ┆ ---      ┆   ┆ ---    ┆ ---    ┆ ---      ┆ ---      │
│ i64 ┆ str  ┆ str   ┆ i64      ┆   ┆ i64    ┆ f64    ┆ f64      ┆ f64      │
╞═════╪══════╪═══════╪══════════╪═══╪════════╪════════╪══════════╪══════════╡
│ 20  ┆ dye  ┆ bulk  ┆ 370      ┆ … ┆ 750    ┆ 67.0   ┆ 5.5      ┆ null     │
│ 31  ┆ NP   ┆ bulk  ┆ 375      ┆ … ┆ 900    ┆ 45.0   ┆ 4.25     ┆ 1.33     │
│ 39  ┆ QD   ┆ film  ┆ 480      ┆ … ┆ 670    ┆ 36.2   ┆ 2.95     ┆ 2.25     │
│ 50  ┆ NP   ┆ film  ┆ 400      ┆ … ┆ 700    ┆ 25.0   ┆ 1.85     ┆ null     │
│ 67  ┆ dye  ┆ bulk  ┆ 446      ┆ … ┆ 800    ┆ 89.5   ┆ 31.3     ┆ null     │
│ 70  ┆ dye  ┆ bulk  ┆ 449      ┆ … ┆ 800    ┆ 80.0   ┆ 27.8     ┆ null     │
│ 80  ┆ dye  ┆ bulk  ┆ 491      ┆ … ┆ 650    ┆ 95.0   ┆ 23.7     ┆ 2.81     │
│ 85  ┆ Ln   ┆ film  ┆ 380      ┆ … ┆ 710    ┆ 30.5   ┆ 0.34     ┆ 0.0019   │
│ 86  ┆ Ln   ┆ film  ┆ 360      ┆ … ┆ 700    ┆ 1.6    ┆ 0.27     ┆ 0.00078  │
│ 87  ┆ Ln   ┆ film  ┆ 370      ┆ … ┆ 710    ┆ 27.0   ┆ 3.2      ┆ 0.007    │
│ 89  ┆ Ln   ┆ film  ┆ 350      ┆ … ┆ 710    ┆ 34.0   ┆ 4.3      ┆ null     │
│ 90  ┆ Ln   ┆ film  ┆ 325      ┆ … ┆ 710    ┆ 8.0    ┆ 1.2      ┆ null     │
│ 93  ┆ Ln   ┆ film  ┆ 370      ┆ … ┆ 710    ┆ 63.0   ┆ 9.0      ┆ null     │
│ 94  ┆ Ln   ┆ film  ┆ 370      ┆ … ┆ 710    ┆ 61.0   ┆ 1.2      ┆ 0.2      │
│ 95  ┆ Ln   ┆ film  ┆ 380      ┆ … ┆ 710    ┆ 23.0   ┆ 0.43     ┆ 0.03     │
│ 96  ┆ Ln   ┆ film  ┆ 360      ┆ … ┆ 710    ┆ 30.0   ┆ 0.01     ┆ 0.0006   │
│ 97  ┆ Ln   ┆ fiber ┆ 360      ┆ … ┆ 710    ┆ 85.0   ┆ 2.3      ┆ 0.00086  │
│ 101 ┆ Ln   ┆ film  ┆ 340      ┆ … ┆ 710    ┆ 44.0   ┆ null     ┆ 0.0441   │
│ 102 ┆ Ln   ┆ film  ┆ 405      ┆ … ┆ 710    ┆ 44.0   ┆ null     ┆ 0.0499   │
│ 103 ┆ Ln   ┆ film  ┆ 350      ┆ … ┆ 710    ┆ 73.0   ┆ 0.28     ┆ 0.28     │
│ 104 ┆ Ln   ┆ film  ┆ 320      ┆ … ┆ 710    ┆ 86.0   ┆ null     ┆ null     │
│ 108 ┆ Ln   ┆ fiber ┆ 370      ┆ … ┆ 710    ┆ 89.0   ┆ 0.7      ┆ null     │
│ 111 ┆ Ln   ┆ fiber ┆ 370      ┆ … ┆ 710    ┆ 89.0   ┆ null     ┆ 0.08     │
│ 120 ┆ QD   ┆ film  ┆ 396      ┆ … ┆ 700    ┆ 53.0   ┆ null     ┆ null     │
│ 153 ┆ QD   ┆ film  ┆ 320      ┆ … ┆ 700    ┆ 32.97  ┆ 2.55     ┆ null     │
│ 163 ┆ CD   ┆ film  ┆ 404      ┆ … ┆ 750    ┆ 86.4   ┆ 2.6      ┆ 2.3      │
│ 201 ┆ Ln   ┆ film  ┆ 370      ┆ … ┆ 720    ┆ 60.0   ┆ 0.02     ┆ 0.000198 │
│ 203 ┆ Ln   ┆ film  ┆ 370      ┆ … ┆ 720    ┆ 60.0   ┆ 0.048    ┆ 0.000471 │
└─────┴──────┴───────┴──────────┴───┴────────┴────────┴──────────┴──────────┘
In [ ]:
plot_stats_per_cluster(df, df_clustering_index, labels)
Cluster: 0
Cluster: 1
Cluster: 2
Cluster: 3
Cluster: 4

Previous version¶

In [ ]:
X = df[['abs_peak', 'abs_min', 'abs_max', 'em_peak', 'em_min', 'em_max', 'QY (%)', 'hopt (%)', 'PCE (%)']].to_numpy()
X[np.isnan(X)] = 0
X
Out[ ]:
array([[3.700e+02, 3.000e+02, 4.500e+02, 6.500e+02, 4.500e+02, 7.500e+02,
        6.700e+01, 5.500e+00, 0.000e+00],
       [3.750e+02, 3.000e+02, 4.500e+02, 7.500e+02, 4.000e+02, 9.000e+02,
        4.500e+01, 4.250e+00, 1.330e+00],
       [4.800e+02, 3.000e+02, 5.000e+02, 6.190e+02, 5.700e+02, 6.700e+02,
        3.620e+01, 2.950e+00, 2.250e+00],
       [4.000e+02, 3.000e+02, 5.000e+02, 6.000e+02, 5.000e+02, 7.000e+02,
        2.500e+01, 1.850e+00, 0.000e+00],
       [4.460e+02, 2.500e+02, 5.000e+02, 5.530e+02, 5.000e+02, 8.000e+02,
        8.950e+01, 3.130e+01, 0.000e+00],
       [4.490e+02, 2.500e+02, 5.500e+02, 5.710e+02, 5.000e+02, 8.000e+02,
        8.000e+01, 2.780e+01, 0.000e+00],
       [4.910e+02, 3.000e+02, 5.000e+02, 5.810e+02, 5.500e+02, 6.500e+02,
        9.500e+01, 2.370e+01, 2.810e+00],
       [3.800e+02, 2.500e+02, 4.000e+02, 6.120e+02, 5.700e+02, 7.100e+02,
        3.050e+01, 3.400e-01, 1.900e-03],
       [3.600e+02, 2.500e+02, 3.800e+02, 5.450e+02, 4.500e+02, 7.000e+02,
        1.600e+00, 2.700e-01, 7.800e-04],
       [3.700e+02, 2.900e+02, 3.800e+02, 6.110e+02, 5.700e+02, 7.100e+02,
        2.700e+01, 3.200e+00, 7.000e-03],
       [3.500e+02, 2.400e+02, 4.200e+02, 6.120e+02, 5.700e+02, 7.100e+02,
        3.400e+01, 4.300e+00, 0.000e+00],
       [3.250e+02, 2.400e+02, 4.000e+02, 6.120e+02, 5.700e+02, 7.100e+02,
        8.000e+00, 1.200e+00, 0.000e+00],
       [3.700e+02, 2.400e+02, 4.000e+02, 6.120e+02, 5.700e+02, 7.100e+02,
        6.300e+01, 9.000e+00, 0.000e+00],
       [3.700e+02, 3.000e+02, 3.800e+02, 6.150e+02, 5.700e+02, 7.100e+02,
        6.100e+01, 1.200e+00, 2.000e-01],
       [3.800e+02, 2.400e+02, 3.800e+02, 6.120e+02, 5.700e+02, 7.100e+02,
        2.300e+01, 4.300e-01, 3.000e-02],
       [3.600e+02, 2.400e+02, 3.800e+02, 6.120e+02, 5.700e+02, 7.100e+02,
        3.000e+01, 1.000e-02, 6.000e-04],
       [3.600e+02, 2.400e+02, 4.200e+02, 6.150e+02, 5.700e+02, 7.100e+02,
        8.500e+01, 2.300e+00, 8.600e-04],
       [3.400e+02, 2.500e+02, 3.800e+02, 6.130e+02, 5.700e+02, 7.100e+02,
        4.400e+01, 0.000e+00, 4.410e-02],
       [4.050e+02, 2.500e+02, 4.200e+02, 6.130e+02, 5.700e+02, 7.100e+02,
        4.400e+01, 0.000e+00, 4.990e-02],
       [3.500e+02, 2.500e+02, 4.000e+02, 6.130e+02, 5.700e+02, 7.100e+02,
        7.300e+01, 2.800e-01, 2.800e-01],
       [3.200e+02, 2.500e+02, 3.500e+02, 6.130e+02, 5.700e+02, 7.100e+02,
        8.600e+01, 0.000e+00, 0.000e+00],
       [3.700e+02, 3.000e+02, 4.500e+02, 6.150e+02, 5.700e+02, 7.100e+02,
        8.900e+01, 7.000e-01, 0.000e+00],
       [3.700e+02, 3.000e+02, 4.500e+02, 6.150e+02, 5.700e+02, 7.100e+02,
        8.900e+01, 0.000e+00, 8.000e-02],
       [3.960e+02, 3.500e+02, 4.500e+02, 5.820e+02, 5.000e+02, 7.000e+02,
        5.300e+01, 0.000e+00, 0.000e+00],
       [3.200e+02, 3.000e+02, 5.500e+02, 6.700e+02, 6.000e+02, 7.000e+02,
        3.297e+01, 2.550e+00, 0.000e+00],
       [4.040e+02, 3.000e+02, 5.500e+02, 5.940e+02, 5.000e+02, 7.500e+02,
        8.640e+01, 2.600e+00, 2.300e+00],
       [3.700e+02, 2.500e+02, 4.000e+02, 6.120e+02, 5.700e+02, 7.200e+02,
        6.000e+01, 2.000e-02, 1.980e-04],
       [3.700e+02, 2.500e+02, 4.000e+02, 6.120e+02, 5.700e+02, 7.200e+02,
        6.000e+01, 4.800e-02, 4.710e-04]])
In [ ]:
# Get the Original Values
QY = X.T[6]
hopt = X.T[7]
PCE = X.T[8]

# Get the Masks for the non-zero values
QY_MASK = QY != 0
hopt_MASK = hopt != 0
PCE_MASK = PCE != 0

# Convert boolean mask to index one
QY_MASK = [i for i in range(len(QY_MASK)) if QY_MASK[i]]
hopt_MASK = [i for i in range(len(hopt_MASK)) if hopt_MASK[i]]
PCE_MASK = [i for i in range(len(PCE_MASK)) if PCE_MASK[i]]

# Compute the average and standard deviation
QY_AVG = np.average(QY[QY_MASK])
QY_STD = np.std(QY[QY_MASK])

hopt_AVG = np.average(hopt[hopt_MASK])
hopt_STD = np.std(hopt[hopt_MASK])

PCE_AVG = np.average(PCE[PCE_MASK])
PCE_STD = np.std(PCE[PCE_MASK])

test = .3
random.seed(42)

# Select the index of the test set
QY_TEST_SET_INDEX = random.sample(QY_MASK, int(len(QY_MASK)*test))
hopt_TEST_SET_INDEX = random.sample(hopt_MASK, int(len(hopt_MASK)*test))
PCE_TEST_SET_INDEX = random.sample(PCE_MASK, int(len(PCE_MASK)*test))

print(f'{QY_TEST_SET_INDEX}\n{hopt_TEST_SET_INDEX}\n{PCE_TEST_SET_INDEX}')
[20, 3, 0, 23, 8, 7, 22, 4]
[3, 26, 19, 2, 21, 13]
[2, 1, 27, 7, 22]
In [ ]:
# Store original values from the test set
QY_TEST_SET = QY[QY_TEST_SET_INDEX]
hopt_TEST_SET = hopt[hopt_TEST_SET_INDEX]
PCE_TEST_SET = PCE[PCE_TEST_SET_INDEX]

print(f'{QY_TEST_SET}\n{hopt_TEST_SET}\n{PCE_TEST_SET}')

# Replace the test set with zeros
QY[QY_TEST_SET_INDEX] = 0
hopt[hopt_TEST_SET_INDEX] = 0
PCE[PCE_TEST_SET_INDEX] = 0

print(f'{QY[QY_TEST_SET_INDEX]}\n{hopt[hopt_TEST_SET_INDEX]}\n{PCE[PCE_TEST_SET_INDEX]}')
[86.  25.  67.  53.   1.6 30.5 89.  89.5]
[1.85 0.02 0.28 2.95 0.7  1.2 ]
[2.25e+00 1.33e+00 4.71e-04 1.90e-03 8.00e-02]
[0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
In [ ]:
# Write on the X Matrix again
X.T[6] = QY
X.T[7] = hopt
X.T[8] = PCE
In [ ]:
# Grid Search
ks = [2,3,4,5,6,7,8]
seeds = [53, 59, 61, 67, 71, 73, 79, 83, 89, 97]

# Store best solution (min cost)
min_cost = float('inf')
solution = None

for k in ks:
    for s in seeds:
        Xr, W, H, cost = nmf.nmf_mu(X, k=k, seed=s)
        if cost < min_cost:
            min_cost = cost
            solution = (k, s)
In [ ]:
print(f"{cost} {solution}")
49.7770480742326 (8, 59)
In [ ]:
# Measure the performance of the reconstruction
Xr, W, H, cost = nmf.nmf_mu(X, k=solution[0], seed=solution[1])

# Get the predicted QY, hopt and PCE
QY = Xr.T[6]
hopt = Xr.T[7]
PCE = Xr.T[8]

QY_PREDICT_SET = QY[QY_TEST_SET_INDEX]
hopt_PREDICT_SET = hopt[hopt_TEST_SET_INDEX]
PCE_PREDICT_SET = PCE[PCE_TEST_SET_INDEX]

print(f'{QY_PREDICT_SET}\n{hopt_PREDICT_SET}\n{PCE_PREDICT_SET}')
[15.97727897 62.32648612 90.01184508 19.35966872 59.88768401 72.13344412
 18.33772585 46.85491737]
[12.57010428  4.919815    1.98392266 24.25932488  2.95390967  1.06247509]
[1.83368088 0.07531809 0.58336117 0.60496385 0.67724153]
In [ ]:
rmse_qy = math.sqrt(mean_squared_error(QY_TEST_SET, QY_PREDICT_SET))
rmse_hopt = math.sqrt(mean_squared_error(hopt_TEST_SET, hopt_PREDICT_SET))
rmse_PCE = math.sqrt(mean_squared_error(PCE_TEST_SET, PCE_PREDICT_SET))

print(f'RMSE(QY): {rmse_qy} ({QY_AVG} ± {QY_STD})')
print(f'RMSE(HOPT): {rmse_hopt} ({hopt_AVG} ± {hopt_STD})')
print(f'RMSE(PCE): {rmse_PCE} ({PCE_AVG} ± {PCE_STD})')
RMSE(QY): 49.875122512813185 (54.22035714285715 ± 26.58863969041543)
RMSE(HOPT): 10.008484102517194 (5.469478260869565 ± 8.892388483707826)
RMSE(PCE): 0.7493587531423203 (0.5521064117647059 ± 0.9382626018509556)

Grid Search (optimize with test set)¶

In [ ]:
# Grid Search
ks = [2,3,4,5,6,7,8]
seeds = [53, 59, 61, 67, 71, 73, 79, 83, 89, 97]

# Store best solution (min cost)
min_cost = float('inf')
solution = None

for k in ks:
    for s in seeds:
        Xr, _, _, _ = nmf.nmf_mu(X, k=k, seed=s)
        # Get the values with the mask
        hopt = Xr.T[7]
        PCE = Xr.T[8]
        hopt_PREDICT_SET = hopt[hopt_TEST_SET_INDEX]
        PCE_PREDICT_SET = PCE[PCE_TEST_SET_INDEX]
        # Compute the cost of the test set
        rmse_hopt = math.sqrt(mean_squared_error(hopt_TEST_SET, hopt_PREDICT_SET))
        rmse_PCE = math.sqrt(mean_squared_error(PCE_TEST_SET, PCE_PREDICT_SET))
        #cost = rmse_hopt + rmse_PCE
        #cost = rmse_hopt
        cost = rmse_PCE
        if cost < min_cost:
            min_cost = cost
            solution = (k, s)
In [ ]:
print(f"{cost} {solution}")
0.6433135652975813 (6, 83)
In [ ]:
# Measure the performance of the reconstruction
Xr, W, H, cost = nmf.nmf_mu(X, k=solution[0], seed=solution[1])

# Get the predicted QY, hopt and PCE
QY = Xr.T[6]
hopt = Xr.T[7]
PCE = Xr.T[8]

QY_PREDICT_SET = QY[QY_TEST_SET_INDEX]
hopt_PREDICT_SET = hopt[hopt_TEST_SET_INDEX]
PCE_PREDICT_SET = PCE[PCE_TEST_SET_INDEX]

print(f'{QY_PREDICT_SET}\n{hopt_PREDICT_SET}\n{PCE_PREDICT_SET}')
[ 37.75873316  80.51076649  62.08184412 143.49604739  73.47495419
  43.23225725  94.97112549  94.22130926]
[11.68839014  5.07296792  4.07583527 19.44763593  5.26625266  4.70790051]
[1.9742166  2.1266637  0.3299664  0.51122061 0.54863358]
In [ ]:
rmse_qy = math.sqrt(mean_squared_error(QY_TEST_SET, QY_PREDICT_SET))
rmse_hopt = math.sqrt(mean_squared_error(hopt_TEST_SET, hopt_PREDICT_SET))
rmse_PCE = math.sqrt(mean_squared_error(PCE_TEST_SET, PCE_PREDICT_SET))

print(f'RMSE(QY): {rmse_qy} ({QY_AVG} ± {QY_STD})')
print(f'RMSE(HOPT): {rmse_hopt} ({hopt_AVG} ± {hopt_STD})')
print(f'RMSE(PCE): {rmse_PCE} ({PCE_AVG} ± {PCE_STD})')
RMSE(QY): 48.74481047927994 (54.22035714285715 ± 26.58863969041543)
RMSE(HOPT): 8.583538425748353 (5.469478260869565 ± 8.892388483707826)
RMSE(PCE): 0.5095726953252181 (0.5521064117647059 ± 0.9382626018509556)

Reconstruct the matrix¶

In [ ]:
# Get the original matrix
X = df[['abs_peak', 'abs_min', 'abs_max', 'em_peak', 'em_min', 'em_max', 'QY (%)', 'hopt (%)', 'PCE (%)']].to_numpy()
X[np.isnan(X)] = 0
print(X.shape)

# Get the reconstructed matrix
Xr, _, _, _ = nmf.nmf_mu(X, k=solution[0], seed=solution[1])

# Replace all zeros with reconstructed values
X[X==0] = Xr[X==0]
(28, 9)
In [ ]:
X
Out[ ]:
array([[3.70000000e+02, 3.00000000e+02, 4.50000000e+02, 6.50000000e+02,
        4.50000000e+02, 7.50000000e+02, 6.70000000e+01, 5.50000000e+00,
        1.30557497e+00],
       [3.75000000e+02, 3.00000000e+02, 4.50000000e+02, 7.50000000e+02,
        4.00000000e+02, 9.00000000e+02, 4.50000000e+01, 4.25000000e+00,
        1.33000000e+00],
       [4.80000000e+02, 3.00000000e+02, 5.00000000e+02, 6.19000000e+02,
        5.70000000e+02, 6.70000000e+02, 3.62000000e+01, 2.95000000e+00,
        2.25000000e+00],
       [4.00000000e+02, 3.00000000e+02, 5.00000000e+02, 6.00000000e+02,
        5.00000000e+02, 7.00000000e+02, 2.50000000e+01, 1.85000000e+00,
        1.78174879e+00],
       [4.46000000e+02, 2.50000000e+02, 5.00000000e+02, 5.53000000e+02,
        5.00000000e+02, 8.00000000e+02, 8.95000000e+01, 3.13000000e+01,
        1.14920925e+00],
       [4.49000000e+02, 2.50000000e+02, 5.50000000e+02, 5.71000000e+02,
        5.00000000e+02, 8.00000000e+02, 8.00000000e+01, 2.78000000e+01,
        1.69867716e+00],
       [4.91000000e+02, 3.00000000e+02, 5.00000000e+02, 5.81000000e+02,
        5.50000000e+02, 6.50000000e+02, 9.50000000e+01, 2.37000000e+01,
        2.81000000e+00],
       [3.80000000e+02, 2.50000000e+02, 4.00000000e+02, 6.12000000e+02,
        5.70000000e+02, 7.10000000e+02, 3.05000000e+01, 3.40000000e-01,
        1.90000000e-03],
       [3.60000000e+02, 2.50000000e+02, 3.80000000e+02, 5.45000000e+02,
        4.50000000e+02, 7.00000000e+02, 1.60000000e+00, 2.70000000e-01,
        7.80000000e-04],
       [3.70000000e+02, 2.90000000e+02, 3.80000000e+02, 6.11000000e+02,
        5.70000000e+02, 7.10000000e+02, 2.70000000e+01, 3.20000000e+00,
        7.00000000e-03],
       [3.50000000e+02, 2.40000000e+02, 4.20000000e+02, 6.12000000e+02,
        5.70000000e+02, 7.10000000e+02, 3.40000000e+01, 4.30000000e+00,
        8.43871139e-01],
       [3.25000000e+02, 2.40000000e+02, 4.00000000e+02, 6.12000000e+02,
        5.70000000e+02, 7.10000000e+02, 8.00000000e+00, 1.20000000e+00,
        8.08810112e-01],
       [3.70000000e+02, 2.40000000e+02, 4.00000000e+02, 6.12000000e+02,
        5.70000000e+02, 7.10000000e+02, 6.30000000e+01, 9.00000000e+00,
        4.13560588e-01],
       [3.70000000e+02, 3.00000000e+02, 3.80000000e+02, 6.15000000e+02,
        5.70000000e+02, 7.10000000e+02, 6.10000000e+01, 1.20000000e+00,
        2.00000000e-01],
       [3.80000000e+02, 2.40000000e+02, 3.80000000e+02, 6.12000000e+02,
        5.70000000e+02, 7.10000000e+02, 2.30000000e+01, 4.30000000e-01,
        3.00000000e-02],
       [3.60000000e+02, 2.40000000e+02, 3.80000000e+02, 6.12000000e+02,
        5.70000000e+02, 7.10000000e+02, 3.00000000e+01, 1.00000000e-02,
        6.00000000e-04],
       [3.60000000e+02, 2.40000000e+02, 4.20000000e+02, 6.15000000e+02,
        5.70000000e+02, 7.10000000e+02, 8.50000000e+01, 2.30000000e+00,
        8.60000000e-04],
       [3.40000000e+02, 2.50000000e+02, 3.80000000e+02, 6.13000000e+02,
        5.70000000e+02, 7.10000000e+02, 4.40000000e+01, 4.95152985e+00,
        4.41000000e-02],
       [4.05000000e+02, 2.50000000e+02, 4.20000000e+02, 6.13000000e+02,
        5.70000000e+02, 7.10000000e+02, 4.40000000e+01, 5.16163221e+00,
        4.99000000e-02],
       [3.50000000e+02, 2.50000000e+02, 4.00000000e+02, 6.13000000e+02,
        5.70000000e+02, 7.10000000e+02, 7.30000000e+01, 2.80000000e-01,
        2.80000000e-01],
       [3.20000000e+02, 2.50000000e+02, 3.50000000e+02, 6.13000000e+02,
        5.70000000e+02, 7.10000000e+02, 8.60000000e+01, 6.49216118e+00,
        1.22326850e-01],
       [3.70000000e+02, 3.00000000e+02, 4.50000000e+02, 6.15000000e+02,
        5.70000000e+02, 7.10000000e+02, 8.90000000e+01, 7.00000000e-01,
        6.61502043e-01],
       [3.70000000e+02, 3.00000000e+02, 4.50000000e+02, 6.15000000e+02,
        5.70000000e+02, 7.10000000e+02, 8.90000000e+01, 9.63044260e+00,
        8.00000000e-02],
       [3.96000000e+02, 3.50000000e+02, 4.50000000e+02, 5.82000000e+02,
        5.00000000e+02, 7.00000000e+02, 5.30000000e+01, 8.78474482e+00,
        1.08010617e+00],
       [3.20000000e+02, 3.00000000e+02, 5.50000000e+02, 6.70000000e+02,
        6.00000000e+02, 7.00000000e+02, 3.29700000e+01, 2.55000000e+00,
        1.99733817e+00],
       [4.04000000e+02, 3.00000000e+02, 5.50000000e+02, 5.94000000e+02,
        5.00000000e+02, 7.50000000e+02, 8.64000000e+01, 2.60000000e+00,
        2.30000000e+00],
       [3.70000000e+02, 2.50000000e+02, 4.00000000e+02, 6.12000000e+02,
        5.70000000e+02, 7.20000000e+02, 6.00000000e+01, 2.00000000e-02,
        1.98000000e-04],
       [3.70000000e+02, 2.50000000e+02, 4.00000000e+02, 6.12000000e+02,
        5.70000000e+02, 7.20000000e+02, 6.00000000e+01, 4.80000000e-02,
        4.71000000e-04]])
In [ ]:
Xr, W, H, cost = nmf.nmf_mu(X, k=solution[0], seed=solution[1])
_, cols = X.shape
field_names = df.columns
print(f'{field_names}')
df = pl.DataFrame({field_names[i+2]: X[:,i] for i in range(cols)})
df
['#', 'mat0', 'mat1', 'abs_peak', 'abs_min', 'abs_max', 'em_peak', 'em_min', 'em_max', 'QY (%)', 'hopt (%)', 'PCE (%)']
Out[ ]:
shape: (28, 9)
mat1abs_peakabs_minabs_maxem_peakem_minem_maxQY (%)hopt (%)
f64f64f64f64f64f64f64f64f64
370.0300.0450.0650.0450.0750.067.05.51.305575
375.0300.0450.0750.0400.0900.045.04.251.33
480.0300.0500.0619.0570.0670.036.22.952.25
400.0300.0500.0600.0500.0700.025.01.851.781749
446.0250.0500.0553.0500.0800.089.531.31.149209
449.0250.0550.0571.0500.0800.080.027.81.698677
491.0300.0500.0581.0550.0650.095.023.72.81
380.0250.0400.0612.0570.0710.030.50.340.0019
360.0250.0380.0545.0450.0700.01.60.270.00078
370.0290.0380.0611.0570.0710.027.03.20.007
350.0240.0420.0612.0570.0710.034.04.30.843871
325.0240.0400.0612.0570.0710.08.01.20.80881
370.0240.0400.0612.0570.0710.063.09.00.413561
370.0300.0380.0615.0570.0710.061.01.20.2
380.0240.0380.0612.0570.0710.023.00.430.03
360.0240.0380.0612.0570.0710.030.00.010.0006
360.0240.0420.0615.0570.0710.085.02.30.00086
340.0250.0380.0613.0570.0710.044.04.951530.0441
405.0250.0420.0613.0570.0710.044.05.1616320.0499
350.0250.0400.0613.0570.0710.073.00.280.28
320.0250.0350.0613.0570.0710.086.06.4921610.122327
370.0300.0450.0615.0570.0710.089.00.70.661502
370.0300.0450.0615.0570.0710.089.09.6304430.08
396.0350.0450.0582.0500.0700.053.08.7847451.080106
320.0300.0550.0670.0600.0700.032.972.551.997338
404.0300.0550.0594.0500.0750.086.42.62.3
370.0250.0400.0612.0570.0720.060.00.020.000198
370.0250.0400.0612.0570.0720.060.00.0480.000471
In [ ]: